evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/data_adapter.py
@@ -1,11 +1,11 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import os.path
+ import random
  from abc import ABC, abstractmethod
  from typing import Any, Optional
- import random

  from evalscope.benchmarks import Benchmark
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR, AnswerKeys
+ from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -29,7 +29,8 @@ class DataAdapter(ABC):
  train_split: str, usually for few-shot examples. e.g. 'train'
  eval_split: str, the target eval split name. e.g. 'test'
  prompt_template: str, the prompt template for the benchmark,
- e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:`
+ e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
+ the form of A or B or C or D, do not output explanation:`
  """
  self.subset_list = subset_list
  self.metric_list = metric_list
@@ -42,8 +43,8 @@ class DataAdapter(ABC):
  def load(self,
  dataset_name_or_path: str,
  subset_list: list = None,
- work_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
- datasets_hub: str = 'ModelScope',
+ work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+ datasets_hub: str = HubType.MODELSCOPE,
  **kwargs) -> dict:
  """
  Load the dataset. Remote and local datasets are supported.
@@ -54,12 +55,11 @@ class DataAdapter(ABC):

  """
  dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
- if datasets_hub == 'Local':
- # Try to load dataset from local disk
- if not os.path.exists(dataset_name_or_path):
- raise FileNotFoundError(f'Dataset path not found: {dataset_name_or_path}')

- logger.info(f'Loading dataset from local disk: >dataset_name: {dataset_name_or_path} >work_dir: {work_dir}')
+ # Try to load dataset from local disk
+ if os.path.exists(dataset_name_or_path):
+ logger.info(
+ f'Loading dataset from local disk: > dataset_name: {dataset_name_or_path} > work_dir: {work_dir}')
  data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
  if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
  raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
@@ -76,12 +76,13 @@ class DataAdapter(ABC):
  data_dict[sub_name] = {}
  # e.g. train: few-shot, test: target dataset to evaluate
  for split in split_list:
- dataset = Benchmark.load(dataset_name=dataset_name_or_path,
- subset=sub_name,
- split=split,
- hub=datasets_hub,
- work_dir=work_dir,
- **kwargs)
+ dataset = Benchmark.load(
+ dataset_name=dataset_name_or_path,
+ subset=sub_name,
+ split=split,
+ hub=datasets_hub,
+ work_dir=work_dir,
+ **kwargs)

  data_dict[sub_name].update({split: dataset})

@@ -112,19 +113,18 @@ class DataAdapter(ABC):
  if self.few_shot_num and self.few_shot_num < 0:
  raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')

- logger.info(f'\n** Use default settings: \n'
- f'>few_shot_num: {self.few_shot_num}, '
- f'>few_shot_split: {self.train_split}, '
- f'>target_eval_split: {self.eval_split}')
+ logger.info(f'Use default settings: '
+ f'> few_shot_num: {self.few_shot_num}, '
+ f'> few_shot_split: {self.train_split}, '
+ f'> target_eval_split: {self.eval_split}')

  for sub_name, sub_data_dict in data_dict.items():
  few_shot_data = []
  if self.few_shot_num and self.few_shot_num > 0:
  few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
- few_shot_data = self.get_fewshot_examples(
- [item for item in sub_data_dict[self.train_split]],
- self.few_shot_num,
- few_shot_random=few_shot_random)
+ few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
+ self.few_shot_num,
+ few_shot_random=few_shot_random)

  res_dict[sub_name] = []
  for sample_d in sub_data_dict[self.eval_split]:
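
Note: the hunks above change DataAdapter.load so that an existing local path is always preferred, with the hub and cache defaults moving to the new HubType.MODELSCOPE and DEFAULT_DATASET_CACHE_DIR constants. Below is a minimal sketch of the new dispatch order; it only mirrors the logic visible in the diff, and the helper name resolve_dataset_source is hypothetical, not part of evalscope.

import os

def resolve_dataset_source(dataset_name_or_path: str) -> str:
    """Hypothetical helper mirroring DataAdapter.load in 0.8.0: an existing
    local path wins; otherwise the name is treated as a hub dataset id."""
    dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
    if os.path.exists(dataset_name_or_path):
        return 'local'  # -> self.load_from_disk(...); raises if the loaded data is empty
    return 'hub'  # -> Benchmark.load(..., hub=HubType.MODELSCOPE, work_dir=DEFAULT_DATASET_CACHE_DIR)

print(resolve_dataset_source('~/data/my_benchmark'))  # 'local' only if that folder exists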

evalscope/benchmarks/general_qa/__init__.py
@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST, GeneralQAAdapter
+ from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
  from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -1,15 +1,15 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import glob
+ import json
  import os.path
+ from collections import defaultdict
+ from typing import Any, Optional

  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
  from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
  from evalscope.utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
- from typing import Any, Optional
- from collections import defaultdict
- import json

  logger = get_logger()

@@ -31,17 +31,11 @@ class GeneralQAAdapter(DataAdapter):

  if metric_list is None:
  metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
-
- super().__init__(subset_list=subset_list,
- metric_list=metric_list,
- train_split=train_split,
- eval_split=eval_split,
- **kwargs)
-
- def load(self,
- dataset_name_or_path: str,
- subset_list: list = None,
- **kwargs) -> dict:
+
+ super().__init__(
+ subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+
+ def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:

  data_file_list = glob.glob(os.path.join(dataset_name_or_path, '*.jsonl'))
  data_list = []
@@ -50,12 +44,12 @@ class GeneralQAAdapter(DataAdapter):
  for file_path in data_file_list:
  data_list.extend(jsonl_to_list(file_path))
  except Exception as e:
- raise ValueError(f"Failed to load data from {dataset_name_or_path}, got error: {e}")
+ raise ValueError(f'Failed to load data from {dataset_name_or_path}, got error: {e}')

  data_dict = {'default': {'test': data_list}}

  return data_dict
-
+
  def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
  """
  Args:
@@ -68,16 +62,17 @@ class GeneralQAAdapter(DataAdapter):

  """
  # prompt = f"'<|im_start|>user\n{input_d['input']}<|im_end|>\n<|im_start|>assistant\n'"
- history = input_d.get('history', []) # history: [['q1', 'a1'], ['q2', 'a2'], ...]
+ history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
  if len(history) > 0:
- logger.warning(f"The history is not included in the prompt for GeneralQA. To be supported in the future.")
+ logger.warning('The history is not included in the prompt for GeneralQA. \
+ To be supported in the future.')

  prompt = input_d.get('question', '') or input_d.get('query', '')

  # if len(history) > 0:
  # prompt = '\n'.join(history) + '\n' + prompt
  return {'data': [prompt]}
-
+
  def get_gold_answer(self, input_d: dict) -> str:
  """
  Args:
@@ -88,7 +83,7 @@ class GeneralQAAdapter(DataAdapter):

  """
  return input_d.get('answer', '') or input_d.get('response', '')
-
+
  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
  """
  Args:
@@ -99,7 +94,7 @@ class GeneralQAAdapter(DataAdapter):

  """
  return result
-
+
  def match(self, gold: str, pred: str) -> float:
  """
  Args:
@@ -110,7 +105,6 @@ class GeneralQAAdapter(DataAdapter):
  bleu_score: float

  """
- item = [(gold, pred)]
  res = dict()
  rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
  bleu_dict = bleu_ngram_one_sample(pred, gold)
@@ -118,7 +112,7 @@ class GeneralQAAdapter(DataAdapter):
  res.update(bleu_dict)
  # return bleu(item)
  return res
-
+
  def compute_metric(self, review_res_list: list) -> float:
  """
  compute weighted mean of the bleu score of all samples
@@ -132,13 +126,13 @@ class GeneralQAAdapter(DataAdapter):
  """
  items = defaultdict(list)
  for scores in review_res_list:
- for k,v in scores.items():
+ for k, v in scores.items():
  items[k].append((v, 1.0))
  # items = [(score, 1.0) for score in review_res_list]
- res = {k: weighted_mean(v) for k,v in items.items()}
+ res = {k: weighted_mean(v) for k, v in items.items()}
  # return weighted_mean(items)
  return res
-
+
  def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
  """
  Args:
@@ -167,20 +161,22 @@ class GeneralQAAdapter(DataAdapter):
  """
  total_num: int = sum([num for _, num in subset_score_map.values()])
  # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
- cate_avg_list = [{'name': subset_name, 'score': score_dict} for subset_name, (score_dict, _) in subset_score_map.items()]
+ cate_avg_list = [{
+ 'name': subset_name,
+ 'score': score_dict
+ } for subset_name, (score_dict, _) in subset_score_map.items()]
  total_avg_list = defaultdict(float)
  for score_dict, num in subset_score_map.values():
  for metric, score in score_dict.items():
  total_avg_list[metric] += score * num / total_num

- category_d = dict(name="DEFAULT",
- score=total_avg_list,
- subset=cate_avg_list)
-
- res_map = dict(name=report_name or "general_qa",
- metric=self.metric_list[0]['name'],
- score=total_avg_list,
- category=[category_d],
- total_num=total_num)
-
- return res_map
+ category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
+
+ res_map = dict(
+ name=report_name or 'general_qa',
+ metric=self.metric_list[0]['name'],
+ score=total_avg_list,
+ category=[category_d],
+ total_num=total_num)
+
+ return res_map
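
Note: the compute_metric and gen_report changes above are formatting-only; the aggregation still turns each sample's score dict into (value, weight) pairs and reduces every metric with weighted_mean. The self-contained sketch below mirrors that loop. weighted_mean lives in evalscope.metrics.metrics and is not shown in this diff, so a plain weighted average is assumed here, and the metric keys in the usage comment are illustrative only.

from collections import defaultdict

def weighted_mean(pairs):
    # Assumed implementation: evalscope's weighted_mean is not part of this diff.
    total_weight = sum(w for _, w in pairs)
    return sum(score * w for score, w in pairs) / total_weight if total_weight else 0.0

def compute_metric_sketch(review_res_list):
    # Mirrors GeneralQAAdapter.compute_metric: every sample's metric dict
    # contributes with weight 1.0, then one weighted mean per metric key.
    items = defaultdict(list)
    for scores in review_res_list:
        for k, v in scores.items():
            items[k].append((v, 1.0))
    return {k: weighted_mean(v) for k, v in items.items()}

# compute_metric_sketch([{'rouge-l': 0.4, 'bleu-4': 0.2}, {'rouge-l': 0.6, 'bleu-4': 0.4}])
# -> {'rouge-l': 0.5, 'bleu-4': 0.3} (up to float rounding)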

evalscope/benchmarks/gsm8k/__init__.py
@@ -2,4 +2,4 @@

  from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
  from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/gsm8k/gsm8k.py
@@ -13,15 +13,12 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  # flake8: noqa
-
  """Grade School Math 8k dataset."""

+ import datasets
  import json
  import textwrap

- import datasets
-
-
  _CITATION = """\
  @misc{cobbe2021training,
  title={Training Verifiers to Solve Math Word Problems},
@@ -76,8 +73,7 @@ class Gsm8k(datasets.GeneratorBasedBuilder):
  using basic arithmetic operations (+ - / *) to reach the final
  answer. A bright middle school student should be able to solve
  every problem.
- """,
- ),
+ """, ),
  urls={
  'train': TRAIN_URL,
  'test': TEST_URL,
@@ -86,12 +82,10 @@ class Gsm8k(datasets.GeneratorBasedBuilder):
  ]

  def _info(self):
- features = datasets.Features(
- {
- 'question': datasets.Value('string'),
- 'answer': datasets.Value('string'),
- }
- )
+ features = datasets.Features({
+ 'question': datasets.Value('string'),
+ 'answer': datasets.Value('string'),
+ })
  return datasets.DatasetInfo(
  description=_DESCRIPTION,
  features=features,
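
Note: the _info() change above is purely cosmetic; the same two-field schema is passed to datasets.Features, only the call layout differs. A quick standalone check against the public HuggingFace datasets API (not part of this package) confirms the two spellings build the same object:

import datasets

old_style = datasets.Features(
    {
        'question': datasets.Value('string'),
        'answer': datasets.Value('string'),
    }
)
new_style = datasets.Features({
    'question': datasets.Value('string'),
    'answer': datasets.Value('string'),
})
assert old_style == new_style  # same schema, formatting change only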

evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -1,12 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI, Inc. and its affiliates.
+ import math
  import os
  import re
- import math
+
  from evalscope.benchmarks import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import normalize_score, jsonl_to_list
+ from evalscope.utils import jsonl_to_list, normalize_score
  from evalscope.utils.logger import get_logger
+
  # flake8: noqa

  logger = get_logger()
@@ -54,13 +56,14 @@ class GSM8KAdapter(DataAdapter):
  f'Use 4-shot by default.')
  few_shot_num = 4

- super().__init__(subset_list=subset_list,
- metric_list=metric_list,
- few_shot_num=few_shot_num,
- train_split=train_split,
- eval_split=eval_split,
- prompt_template=prompt_template,
- **kwargs)
+ super().__init__(
+ subset_list=subset_list,
+ metric_list=metric_list,
+ few_shot_num=few_shot_num,
+ train_split=train_split,
+ eval_split=eval_split,
+ prompt_template=prompt_template,
+ **kwargs)

  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -182,17 +185,19 @@ class GSM8KAdapter(DataAdapter):
  total_num: int = sum([num for _, num in subset_score_map.values()])
  weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
  weighted_avg_acc = normalize_score(score=weighted_avg_acc)
- cate_avg_list = [{'name': subset_name, 'score': normalize_score(score=score)} for subset_name, (score, _) in subset_score_map.items()]
+ cate_avg_list = [{
+ 'name': subset_name,
+ 'score': normalize_score(score=score)
+ } for subset_name, (score, _) in subset_score_map.items()]

- category_d = dict(name='DEFAULT',
- score=weighted_avg_acc,
- subset=cate_avg_list)
+ category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

- res_map = dict(name=report_name or 'gsm8k',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=[category_d],
- total_num=total_num)
+ res_map = dict(
+ name=report_name or 'gsm8k',
+ metric=self.metric_list[0]['name'],
+ score=weighted_avg_acc,
+ category=[category_d],
+ total_num=total_num)

  return res_map

@@ -209,8 +214,7 @@ class GSM8KAdapter(DataAdapter):
  "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n"
  "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n"
  'For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n'
- f"Question: {input_d['question']}\nLet's think step by step\nAnswer:"
- )
+ f"Question: {input_d['question']}\nLet's think step by step\nAnswer:")
  # context = input_d['question']
  # fewshot_prompts = ['Question: ' + item_d['question'] + '\nAnswer: ' + item_d['answer'] for item_d in few_shot_list]
  # fewshot_prompts = fewshot_prompts + ['Question: ' + context + '\nAnswer:']
@@ -222,9 +226,7 @@ class GSM8KAdapter(DataAdapter):

  @staticmethod
  def extract_answer(s: str) -> str:
- _PAT_LAST_DIGIT = re.compile(
- r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)'
- )
+ _PAT_LAST_DIGIT = re.compile(r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)')
  match = list(_PAT_LAST_DIGIT.finditer(s))
  if match:
  last_digit = match[-1].group().replace(',', '').replace('+', '').strip().strip('.')
@@ -233,4 +235,4 @@ class GSM8KAdapter(DataAdapter):
  last_digit = None
  print(f'No digits found in {s!r}', flush=True)

- return last_digit
+ return last_digit
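
Note: the extract_answer hunk above only joins the re.compile call onto one line; the pattern itself is unchanged. A standalone copy is shown below to illustrate what it does: it pulls the last number out of a completion, tolerating signs, thousands separators and decimals (the function body is reproduced from the diff, with the class context dropped for brevity).

import re

# Same pattern as in GSM8KAdapter.extract_answer: an optionally signed number,
# possibly with thousands separators and a decimal part.
_PAT_LAST_DIGIT = re.compile(r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)')

def extract_answer(s: str):
    # Take the last number in the text and strip commas, '+' and a trailing dot.
    matches = list(_PAT_LAST_DIGIT.finditer(s))
    if not matches:
        return None
    return matches[-1].group().replace(',', '').replace('+', '').strip().strip('.')

print(extract_answer('The four baskets together have 32+114=146 fruits.\nThe answer is 146'))  # '146'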

evalscope/benchmarks/hellaswag/__init__.py
@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter, DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
  from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa
+ from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/hellaswag/hellaswag.py
@@ -1,20 +1,18 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-
  """HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI.
  A paper was published at ACL2019.
  """
-
  """DO NOT EDIT."""

- import json
  import datasets
+ import json
+
  # flake8: noqa

  # HomePage: https://rowanzellers.com/hellaswag/
  # GitHub: https://github.com/rowanz/hellaswag

-
  _CITATION = """\
  @inproceedings{zellers2019hellaswag,
  title={HellaSwag: Can a Machine Really Finish Your Sentence?},
@@ -47,21 +45,19 @@ class Hellaswag(datasets.GeneratorBasedBuilder):
  # This is the description that will appear on the datasets page.
  description=_DESCRIPTION,
  # datasets.features.FeatureConnectors
- features=datasets.Features(
- {
- # These are the features of your dataset like images, labels ...
- 'ind': datasets.Value('int32'),
- 'activity_label': datasets.Value('string'),
- 'ctx_a': datasets.Value('string'),
- 'ctx_b': datasets.Value('string'),
- 'ctx': datasets.Value('string'),
- 'endings': datasets.features.Sequence(datasets.Value('string')),
- 'source_id': datasets.Value('string'),
- 'split': datasets.Value('string'),
- 'split_type': datasets.Value('string'),
- 'label': datasets.Value('string'),
- }
- ),
+ features=datasets.Features({
+ # These are the features of your dataset like images, labels ...
+ 'ind': datasets.Value('int32'),
+ 'activity_label': datasets.Value('string'),
+ 'ctx_a': datasets.Value('string'),
+ 'ctx_b': datasets.Value('string'),
+ 'ctx': datasets.Value('string'),
+ 'endings': datasets.features.Sequence(datasets.Value('string')),
+ 'source_id': datasets.Value('string'),
+ 'split': datasets.Value('string'),
+ 'split_type': datasets.Value('string'),
+ 'label': datasets.Value('string'),
+ }),
  # If there's a common (input, target) tuple from the features,
  # specify them here. They'll be used if as_supervised=True in
  # builder.as_dataset.

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -1,18 +1,17 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import numpy as np
  import os
  import re
- import numpy as np

  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import normalize_score, jsonl_to_list
+ from evalscope.utils import jsonl_to_list, normalize_score
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

-
  DATASET_ID = 'modelscope/hellaswag'
  SUBSET_LIST = ['default']

@@ -44,12 +43,13 @@ class HellaSwagAdapter(DataAdapter):
  logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
  few_shot_num = 0

- super().__init__(subset_list=subset_list,
- metric_list=metric_list,
- few_shot_num=few_shot_num,
- train_split=train_split,
- eval_split=eval_split,
- **kwargs)
+ super().__init__(
+ subset_list=subset_list,
+ metric_list=metric_list,
+ few_shot_num=few_shot_num,
+ train_split=train_split,
+ eval_split=eval_split,
+ **kwargs)

  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -91,7 +91,9 @@ class HellaSwagAdapter(DataAdapter):

  endings: list = [self._preprocess(ending) for ending in input_d['endings']]

- few_shot_prompts = [self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list]
+ few_shot_prompts = [
+ self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list
+ ]
  context: str = '\n'.join(few_shot_prompts) + '\n'
  context += self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)

@@ -124,9 +126,9 @@ class HellaSwagAdapter(DataAdapter):

  return str(best_choice_idx)
  elif eval_type == 'service':
- return result # TODO: to be supported !
+ return result  # TODO: to be supported !
  elif eval_type == 'custom':
- return result # TODO: to be supported !
+ return result  # TODO: to be supported !
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

@@ -177,17 +179,19 @@ class HellaSwagAdapter(DataAdapter):
  total_num: int = sum([num for _, num in subset_score_map.values()])
  weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
  weighted_avg_acc = normalize_score(score=weighted_avg_acc)
- cate_avg_list = [{'name': subset_name, 'score': normalize_score(score=score)} for subset_name, (score, _) in subset_score_map.items()]
-
- category_d = dict(name='DEFAULT',
- score=weighted_avg_acc,
- subset=cate_avg_list)
-
- res_map = dict(name=report_name or 'hellaswag',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=[category_d],
- total_num=total_num)
+ cate_avg_list = [{
+ 'name': subset_name,
+ 'score': normalize_score(score=score)
+ } for subset_name, (score, _) in subset_score_map.items()]
+
+ category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+ res_map = dict(
+ name=report_name or 'hellaswag',
+ metric=self.metric_list[0]['name'],
+ score=weighted_avg_acc,
+ category=[category_d],
+ total_num=total_num)

  return res_map


evalscope/benchmarks/humaneval/__init__.py
@@ -2,4 +2,4 @@

  from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
  from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/humaneval/humaneval.py
@@ -1,6 +1,7 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import json
  import datasets
+ import json
+
  # flake8: noqa

  # NOTE: AUTOGENERATED, DO NOT CHANGE.
@@ -41,15 +42,13 @@ class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
  ]

  def _info(self):
- features = datasets.Features(
- {
- 'task_id': datasets.Value('string'),
- 'prompt': datasets.Value('string'),
- 'canonical_solution': datasets.Value('string'),
- 'test': datasets.Value('string'),
- 'entry_point': datasets.Value('string'),
- }
- )
+ features = datasets.Features({
+ 'task_id': datasets.Value('string'),
+ 'prompt': datasets.Value('string'),
+ 'canonical_solution': datasets.Value('string'),
+ 'test': datasets.Value('string'),
+ 'entry_point': datasets.Value('string'),
+ })

  return datasets.DatasetInfo(
  description=_DESCRIPTION,
@@ -63,14 +62,12 @@ class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
  def _split_generators(self, dl_manager):
  """Returns SplitGenerators."""
  data_dir = dl_manager.download_and_extract(_URL)
- return [
- datasets.SplitGenerator(
- name=datasets.Split.TEST,
- gen_kwargs={
- 'filepath': data_dir,
- },
- )
- ]
+ return [datasets.SplitGenerator(
+ name=datasets.Split.TEST,
+ gen_kwargs={
+ 'filepath': data_dir,
+ },
+ )]

  def _generate_examples(self, filepath):
  """Yields examples."""

evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -2,7 +2,6 @@

  # flake8: noqa

-
  DATASET_ID = 'modelscope/humaneval'
  SUBSET_LIST = ['openai_humaneval']