evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic according to the registry's automated checks.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -1,21 +1,206 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import json
+ import os
+ import re
+ from tqdm import tqdm
+ from typing import List

- # flake8: noqa
+ from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.metrics.metrics import weighted_mean
+ from evalscope.tools.combine_reports import gen_table
+ from evalscope.utils import normalize_score
+ from evalscope.utils.logger import get_logger

+ logger = get_logger()

  DATASET_ID = 'modelscope/humaneval'
  SUBSET_LIST = ['openai_humaneval']

- # Note: ONLY FOR CLASS IMPORT, No implementation here.
-
  # Example:
- # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
+ # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


- class HumanevalAdapter:
+ class HumanevalAdapter(DataAdapter):
  """
  A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
  """

- def __init__(self):
- ...
+ def __init__(self,
+ subset_list: list = None,
+ metric_list: list = None,
+ few_shot_num: int = None,
+ train_split: str = None,
+ eval_split: str = 'test',
+ prompt_template: str = 'Complete the following python code:\n',
+ **kwargs):
+ try:
+ from human_eval.data import stream_jsonl, write_jsonl
+ from human_eval.evaluation import check_correctness
+ except ImportError:
+ raise ImportError('Please install human_eval:'
+ 'https://github.com/openai/human-eval/tree/master#installation , '
+ 'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+ if subset_list is None:
+ subset_list = SUBSET_LIST
+
+ if metric_list is None:
+ metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+ self.k = [1]
+ self.num_workers = 4
+ self.timeout = 4.0
+ self.outputs = kwargs.get('outputs', None)
+
+ self.read_problems_func = stream_jsonl
+ self.write_jsonl_func = write_jsonl
+ self.eval_func = check_correctness
+
+ super().__init__(
+ subset_list=subset_list,
+ metric_list=metric_list,
+ few_shot_num=few_shot_num,
+ train_split=train_split,
+ eval_split=eval_split,
+ prompt_template=prompt_template,
+ **kwargs)
+
+ def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+ data_dict = {}
+ for subset_name in subset_list:
+ data_dict[subset_name] = {}
+ # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
+ data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
+
+ return data_dict
+
+ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+ """
+ Generate prompt for the model.
+
+ Args:
+ input_d (dict): The raw input. A single data format of the Humaneval:
+ {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
+ """
+ full_prompt = input_d['prompt']
+ full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+
+ return {'data': [full_prompt]}
+
+ def get_answers(self, infer_cfg: dict) -> List[dict]:
+ ans_list: list = []
+ system_prompt: str = ''
+ for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+ prompt: str = system_prompt + data_d['prompt']
+ inputs: dict = {'data': [prompt]}
+
+ pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+ pred_ans: str = pred_res['choices'][0]['message']['content']
+ pred_ans = self._postprocess(pred_ans)
+
+ ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+ return ans_list
+
+ def eval(self, infer_cfg: dict, **kwargs):
+
+ # predict
+ ans_list: list = self.get_answers(infer_cfg)
+ ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+ self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+ # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+ logger.info('** Dump predictions successfully.')
+
+ # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+ results = self.eval_func(
+ sample_file=ans_out_file,
+ k=self.k,
+ n_workers=self.num_workers,
+ timeout=self.timeout,
+ problem_file=self.problem_file)
+
+ # output: report
+ report_map: dict = self.gen_report(results=results)
+ report_dir: str = self.outputs_structure.reports_dir
+ report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+ with open(report_file, 'w') as f:
+ f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+ # logger.info(f'** Dump report to {report_file} \n')
+ logger.info('** Dump report \n')
+
+ try:
+ # Make table
+ report_table: str = gen_table([report_dir])
+ logger.info(f'** Report table: \n {report_table} \n')
+ except Exception:
+ logger.error('Failed to generate report table.')
+
+ def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+ total_num: int = sum([num for _, num in subset_score_map.values()])
+ weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+ weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+ cate_avg_list = [{
+ 'name': subset_name,
+ 'score': normalize_score(score=score)
+ } for subset_name, (score, _) in subset_score_map.items()]
+
+ category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+ res_map = dict(
+ name=report_name or 'HumanEval',
+ metric='pass@1',
+ score=weighted_avg_acc,
+ category=[category_d],
+ total_num=total_num)
+
+ return res_map
+
+ @classmethod
+ def _postprocess(cls, text: str) -> str:
+ if '```' in text:
+ blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+ if len(blocks) == 0:
+ text = text.split('```')[1] # fall back to default strategy
+ else:
+ text = blocks[0] # fetch the first code block
+ if not text.startswith('\n'): # in case starting with ```python
+ text = text[max(text.find('\n') + 1, 0):]
+ if text.strip().startswith('from') or text.strip().startswith('import'):
+ def_idx = text.find('def')
+ if def_idx != -1:
+ text = text[max(text.find('\n', def_idx) + 1, 0):]
+ text = text.split('\n\n')[0]
+ if text.strip().startswith('def'):
+ text = '\n'.join(text.split('\n')[1:])
+ if not text.startswith(' '):
+ if text.startswith(' '):
+ text = ' ' + text.lstrip()
+ else:
+ text = '\n'.join([' ' + line for line in text.split('\n')])
+ return text
+
+ def compute_metric(self, review_res_list: list) -> float:
+ """
+ Compute evaluation result by specific metric.
+
+ Args:
+ review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+ Returns:
+ The metric score.
+ """
+ items = [(score, 1.0) for score in review_res_list]
+ return weighted_mean(items)
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+ return self._postprocess(result)
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ return input_d
+
+ def match(self, gold: str, pred: str) -> float:
+ res = self.eval_func(gold, pred, self.timeout)
+ return float(res['passed'])
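
For context on the `_postprocess` hook added above: it strips markdown fences and a leading `def` line so that only the function body is submitted to the HumanEval harness. Below is a minimal, self-contained sketch (not part of the package) of that extraction on a made-up chat-style completion; `extract_completion` and the four-space indent are illustrative simplifications, since the rendered diff does not preserve the original indent widths and the real method also handles import-prefixed answers:

import re

def extract_completion(text: str) -> str:
    # Simplified mirror of HumanevalAdapter._postprocess above.
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        text = text.split('```')[1] if len(blocks) == 0 else blocks[0]
        if not text.startswith('\n'):  # drop a leading "python" language tag
            text = text[max(text.find('\n') + 1, 0):]
    if text.strip().startswith('def'):
        # keep only the body; the harness supplies the signature
        text = '\n'.join(text.split('\n')[1:])
    if not text.startswith('    '):
        text = '\n'.join(['    ' + line for line in text.split('\n')])
    return text

sample = "```python\ndef add(a, b):\n    return a + b\n```"
print(extract_completion(sample))  # prints "    return a + b"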
evalscope/benchmarks/mmlu/__init__.py

@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST, MMLUAdapter
+ from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+ from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter
  from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
+ from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/mmlu/mmlu.py

@@ -1,3 +1,4 @@
+ # isort: skip_file
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
  #
@@ -14,14 +15,11 @@
  # limitations under the License.
  # flake8: noqa

- import os
-
  import datasets
+ import os
  import pandas as pd
-
  """The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""

-
  _CITATION = """\
  @article{hendryckstest2021,
  title={Measuring Massive Multitask Language Understanding},
@@ -105,29 +103,23 @@ task_list = [


  class MMLUConfig(datasets.BuilderConfig):
+
  def __init__(self, **kwargs):
  super().__init__(version=datasets.Version('1.0.0'), **kwargs)


  class MMLU(datasets.GeneratorBasedBuilder):
- BUILDER_CONFIGS = [
- MMLUConfig(
- name=task_name,
- )
- for task_name in task_list
- ]
+ BUILDER_CONFIGS = [MMLUConfig(name=task_name, ) for task_name in task_list]

  def _info(self):
- features = datasets.Features(
- {
- 'input': datasets.Value('string'),
- 'A': datasets.Value('string'),
- 'B': datasets.Value('string'),
- 'C': datasets.Value('string'),
- 'D': datasets.Value('string'),
- 'target': datasets.Value('string'),
- }
- )
+ features = datasets.Features({
+ 'input': datasets.Value('string'),
+ 'A': datasets.Value('string'),
+ 'B': datasets.Value('string'),
+ 'C': datasets.Value('string'),
+ 'D': datasets.Value('string'),
+ 'target': datasets.Value('string'),
+ })
  return datasets.DatasetInfo(
  description=_DESCRIPTION,
  features=features,
@@ -143,25 +135,19 @@ class MMLU(datasets.GeneratorBasedBuilder):
  datasets.SplitGenerator(
  name=datasets.Split.TEST,
  gen_kwargs={
- 'filepath': os.path.join(
- data_dir, 'data', 'test', f'{task_name}_test.csv'
- ),
+ 'filepath': os.path.join(data_dir, 'data', 'test', f'{task_name}_test.csv'),
  },
  ),
  datasets.SplitGenerator(
  name=datasets.Split.VALIDATION,
  gen_kwargs={
- 'filepath': os.path.join(
- data_dir, 'data', 'val', f'{task_name}_val.csv'
- ),
+ 'filepath': os.path.join(data_dir, 'data', 'val', f'{task_name}_val.csv'),
  },
  ),
  datasets.SplitGenerator(
  name=datasets.Split.TRAIN,
  gen_kwargs={
- 'filepath': os.path.join(
- data_dir, 'data', 'dev', f'{task_name}_dev.csv'
- ),
+ 'filepath': os.path.join(data_dir, 'data', 'dev', f'{task_name}_dev.csv'),
  },
  ),
  ]
evalscope/benchmarks/mmlu/mmlu_adapter.py

@@ -4,8 +4,9 @@ import os

  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import normalize_score, ResponseParser
+ from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.logger import get_logger
+
  # flake8: noqa

  logger = get_logger()
@@ -72,65 +73,65 @@ SUBSET_LIST = [
  'college_biology',
  ]

-
- SUBJECT_MAPPING = {'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
- 'anatomy': ['Anatomy', 'health', 'Other'],
- 'astronomy': ['Astronomy', 'physics', 'STEM'],
- 'business_ethics': ['Business Ethics', 'business', 'Other'],
- 'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
- 'college_biology': ['College Biology', 'biology', 'STEM'],
- 'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
- 'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
- 'college_mathematics': ['College Mathematics', 'math', 'STEM'],
- 'college_medicine': ['College Medicine', 'health', 'Other'],
- 'college_physics': ['College Physics', 'physics', 'STEM'],
- 'computer_security': ['Computer Security', 'computer science', 'STEM'],
- 'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
- 'econometrics': ['Econometrics', 'economics', 'Social Science'],
- 'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
- 'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
- 'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
- 'global_facts': ['Global Facts', 'other', 'Other'],
- 'high_school_biology': ['High School Biology', 'biology', 'STEM'],
- 'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
- 'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
- 'high_school_european_history': ['High School European History', 'history', 'Humanities'],
- 'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
- 'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
- 'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
- 'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
- 'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
- 'high_school_physics': ['High School Physics', 'physics', 'STEM'],
- 'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
- 'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
- 'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
- 'high_school_world_history': ['High School World History', 'history', 'Humanities'],
- 'human_aging': ['Human Aging', 'health', 'Other'],
- 'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
- 'international_law': ['International Law', 'law', 'Humanities'],
- 'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
- 'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
- 'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
- 'management': ['Management', 'business', 'Other'],
- 'marketing': ['Marketing', 'business', 'Other'],
- 'medical_genetics': ['Medical Genetics', 'health', 'Other'],
- 'miscellaneous': ['Miscellaneous', 'other', 'Other'],
- 'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
- 'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
- 'nutrition': ['Nutrition', 'health', 'Other'],
- 'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
- 'prehistory': ['Prehistory', 'history', 'Humanities'],
- 'professional_accounting': ['Professional Accounting', 'other', 'Other'],
- 'professional_law': ['Professional Law', 'law', 'Humanities'],
- 'professional_medicine': ['Professional Medicine', 'health', 'Other'],
- 'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
- 'public_relations': ['Public Relations', 'politics', 'Social Science'],
- 'security_studies': ['Security Studies', 'politics', 'Social Science'],
- 'sociology': ['Sociology', 'culture', 'Social Science'],
- 'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
- 'virology': ['Virology', 'health', 'Other'],
- 'world_religions': ['World Religions', 'philosophy', 'Humanities'],
- }
+ SUBJECT_MAPPING = {
+ 'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+ 'anatomy': ['Anatomy', 'health', 'Other'],
+ 'astronomy': ['Astronomy', 'physics', 'STEM'],
+ 'business_ethics': ['Business Ethics', 'business', 'Other'],
+ 'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+ 'college_biology': ['College Biology', 'biology', 'STEM'],
+ 'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+ 'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+ 'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+ 'college_medicine': ['College Medicine', 'health', 'Other'],
+ 'college_physics': ['College Physics', 'physics', 'STEM'],
+ 'computer_security': ['Computer Security', 'computer science', 'STEM'],
+ 'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+ 'econometrics': ['Econometrics', 'economics', 'Social Science'],
+ 'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+ 'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+ 'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+ 'global_facts': ['Global Facts', 'other', 'Other'],
+ 'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+ 'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+ 'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+ 'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+ 'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+ 'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+ 'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+ 'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+ 'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+ 'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+ 'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+ 'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+ 'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+ 'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+ 'human_aging': ['Human Aging', 'health', 'Other'],
+ 'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+ 'international_law': ['International Law', 'law', 'Humanities'],
+ 'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+ 'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+ 'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+ 'management': ['Management', 'business', 'Other'],
+ 'marketing': ['Marketing', 'business', 'Other'],
+ 'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+ 'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+ 'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+ 'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+ 'nutrition': ['Nutrition', 'health', 'Other'],
+ 'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+ 'prehistory': ['Prehistory', 'history', 'Humanities'],
+ 'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+ 'professional_law': ['Professional Law', 'law', 'Humanities'],
+ 'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+ 'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+ 'public_relations': ['Public Relations', 'politics', 'Social Science'],
+ 'security_studies': ['Security Studies', 'politics', 'Social Science'],
+ 'sociology': ['Sociology', 'culture', 'Social Science'],
+ 'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+ 'virology': ['Virology', 'health', 'Other'],
+ 'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+ }


  class MMLUAdapter(DataAdapter):
@@ -160,12 +161,13 @@ class MMLUAdapter(DataAdapter):
  logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
  few_shot_num = 5

- super().__init__(subset_list=subset_list,
- metric_list=metric_list,
- few_shot_num=few_shot_num,
- train_split=train_split,
- eval_split=eval_split,
- **kwargs)
+ super().__init__(
+ subset_list=subset_list,
+ metric_list=metric_list,
+ few_shot_num=few_shot_num,
+ train_split=train_split,
+ eval_split=eval_split,
+ **kwargs)

  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -227,8 +229,7 @@ class MMLUAdapter(DataAdapter):

  """
  prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
- self._format_subject(subset_name)
- )
+ self._format_subject(subset_name))
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]

  context: str = '\n'.join(few_shot_prompts) + '\n'
@@ -335,19 +336,26 @@ class MMLUAdapter(DataAdapter):
  domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
  sum([num for _, _, num in domain_res_list])
  domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({'name': domain_name,
- 'score': domain_weighted_avg_acc,
- 'subset': [{'name': subset_name, 'score': normalize_score(score=subset_score)}
- for subset_name, subset_score, _ in domain_res_list]})
+ category_list.append({
+ 'name':
+ domain_name,
+ 'score':
+ domain_weighted_avg_acc,
+ 'subset': [{
+ 'name': subset_name,
+ 'score': normalize_score(score=subset_score)
+ } for subset_name, subset_score, _ in domain_res_list]
+ })

  category_list = sorted(category_list, key=lambda x: x['name'])

  # Get final dict of report
- res_map = dict(name=report_name or 'mmlu',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
+ res_map = dict(
+ name=report_name or 'mmlu',
+ metric=self.metric_list[0]['name'],
+ score=weighted_avg_acc,
+ category=category_list,
+ total_num=total_num)

  return res_map

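The `gen_report` code touched above (and the HumanEval variant earlier in this diff) rolls per-subset accuracy up into a weighted average by sample count; the MMLU adapter additionally repeats the same arithmetic per domain. A tiny standalone sketch of that aggregation, with made-up subset scores and counts for illustration only:

# Hypothetical subset_score_map: subset name -> (accuracy, number of samples)
subset_score_map = {
    'abstract_algebra': (0.40, 100),
    'anatomy': (0.60, 135),
}

# Same arithmetic as gen_report: total sample count and count-weighted accuracy.
total_num = sum(num for _, num in subset_score_map.values())
weighted_avg_acc = sum(score * num for score, num in subset_score_map.values()) / total_num
print(total_num, round(weighted_avg_acc, 4))  # 235 0.5149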
evalscope/benchmarks/race/__init__.py

@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST, RACEAdapter
+ from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+ from evalscope.benchmarks.race.race_adapter import RACEAdapter
  from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
+ from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/race/race.py

@@ -11,12 +11,10 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import os
-
  import datasets
+ import os
  import pandas as pd

-
  _CITATION = """\
  @inproceedings{lai-etal-2017-race,
  title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
@@ -40,39 +38,33 @@ _DESCRIPTION = """\
  RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
  """

- _HOMEPAGE = "https://modelscope.cn/datasets/modelscope/race/summary"
+ _HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/race/summary'

- _URL = "https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip"
+ _URL = 'https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip'

  task_list = [
- "high",
- "middle",
+ 'high',
+ 'middle',
  ]


  class RACEConfig(datasets.BuilderConfig):
+
  def __init__(self, **kwargs):
- super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+ super().__init__(version=datasets.Version('1.0.0'), **kwargs)


  class RACE(datasets.GeneratorBasedBuilder):
- BUILDER_CONFIGS = [
- RACEConfig(
- name=task_name,
- )
- for task_name in task_list
- ]
+ BUILDER_CONFIGS = [RACEConfig(name=task_name, ) for task_name in task_list]

  def _info(self):
- features = datasets.Features(
- {
- "example_id": datasets.Value("string"),
- "article": datasets.Value("string"),
- "answer": datasets.Value("string"),
- "question": datasets.Value("string"),
- "options": [datasets.Value("string")],
- }
- )
+ features = datasets.Features({
+ 'example_id': datasets.Value('string'),
+ 'article': datasets.Value('string'),
+ 'answer': datasets.Value('string'),
+ 'question': datasets.Value('string'),
+ 'options': [datasets.Value('string')],
+ })
  return datasets.DatasetInfo(
  description=_DESCRIPTION,
  features=features,
@@ -87,32 +79,26 @@ class RACE(datasets.GeneratorBasedBuilder):
  datasets.SplitGenerator(
  name=datasets.Split.TEST,
  gen_kwargs={
- "filepath": os.path.join(
- data_dir, f"race/test/{task_name}-00000-of-00001.parquet"
- ),
+ 'filepath': os.path.join(data_dir, f'race/test/{task_name}-00000-of-00001.parquet'),
  },
  ),
  datasets.SplitGenerator(
  name=datasets.Split.VALIDATION,
  gen_kwargs={
- "filepath": os.path.join(
- data_dir, f"race/val/{task_name}-00000-of-00001.parquet"
- ),
+ 'filepath': os.path.join(data_dir, f'race/val/{task_name}-00000-of-00001.parquet'),
  },
  ),
  datasets.SplitGenerator(
  name=datasets.Split.TRAIN,
  gen_kwargs={
- "filepath": os.path.join(
- data_dir, f"race/train/{task_name}-00000-of-00001.parquet"
- ),
+ 'filepath': os.path.join(data_dir, f'race/train/{task_name}-00000-of-00001.parquet'),
  },
  ),
  ]

  def _generate_examples(self, filepath):
  df = pd.read_parquet(filepath)
- df.columns = ["example_id", "article", "answer", "question", "options"]
+ df.columns = ['example_id', 'article', 'answer', 'question', 'options']

- for i, instance in enumerate(df.to_dict(orient="records")):
- yield i, instance
+ for i, instance in enumerate(df.to_dict(orient='records')):
+ yield i, instance