evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/race/race_adapter.py CHANGED
@@ -1,26 +1,23 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import os
  import json
+ import os
+
  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import normalize_score, jsonl_to_list
+ from evalscope.utils import normalize_score
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
+
  # flake8: noqa

  logger = get_logger()

  DATASET_ID = 'modelscope/race'

- SUBSET_LIST = [
- "high",
- "middle"
- ]
-
+ SUBSET_LIST = ['high', 'middle']

- SUBJECT_MAPPING = {"high": "High",
- "middle": "Middle"
- }
+ SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}


  class RACEAdapter(DataAdapter):
@@ -49,12 +46,13 @@ class RACEAdapter(DataAdapter):
  logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
  few_shot_num = 3

- super().__init__(subset_list=subset_list,
- metric_list=metric_list,
- few_shot_num=few_shot_num,
- train_split=train_split,
- eval_split=eval_split,
- **kwargs)
+ super().__init__(
+ subset_list=subset_list,
+ metric_list=metric_list,
+ few_shot_num=few_shot_num,
+ train_split=train_split,
+ eval_split=eval_split,
+ **kwargs)

  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -92,8 +90,7 @@ class RACEAdapter(DataAdapter):

  """
  prompt = 'The following are multiple choice reading comprehension questions (with answers).\n\n'.format(
- self._format_subject(subset_name)
- )
+ self._format_subject(subset_name))
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]

  context: str = '\n'.join(few_shot_prompts) + '\n'
@@ -122,9 +119,9 @@ class RACEAdapter(DataAdapter):
  """
  if eval_type == 'checkpoint':
  return result
- elif eval_type == 'service': # TODO: to be implemented
+ elif eval_type == 'service':  # TODO: to be implemented
  return result
- elif eval_type == 'custom': # TODO: to be implemented
+ elif eval_type == 'custom':  # TODO: to be implemented
  return result
  else:
  raise ValueError(f'Unknown eval_type: {eval_type}')
@@ -191,17 +188,24 @@ class RACEAdapter(DataAdapter):
  domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
  sum([num for _, _, num in domain_res_list])
  domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({'name': domain_name,
- 'score': normalize_score(score=domain_weighted_avg_acc),
- 'subset': [{'name': subset_name, 'score': subset_score}
- for subset_name, subset_score, _ in domain_res_list]})
+ category_list.append({
+ 'name':
+ domain_name,
+ 'score':
+ normalize_score(score=domain_weighted_avg_acc),
+ 'subset': [{
+ 'name': subset_name,
+ 'score': subset_score
+ } for subset_name, subset_score, _ in domain_res_list]
+ })

  # Get final dict of report
- res_map = dict(name=report_name or 'race',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
+ res_map = dict(
+ name=report_name or 'race',
+ metric=self.metric_list[0]['name'],
+ score=weighted_avg_acc,
+ category=category_list,
+ total_num=total_num)

  return res_map

evalscope/benchmarks/race/samples.jsonl CHANGED
@@ -2,4 +2,4 @@
  {'example_id': 'middle3329.txt', 'article': 'Do you know why diff...ng at all.', 'answer': 'B', 'question': 'Those pests with dif...of danger.', 'options': ['change their colours', 'hide in the day time...r at night', 'move quietly', 'hide at night and ap...e day time']}
  {'example_id': 'middle3614.txt', 'article': 'The seahorse is a ve...o the sea.', 'answer': 'B', 'question': 'A seahorse eats _ .', 'options': ['sea weed', 'small fish', 'water', 'nothing']}
  {'example_id': 'middle6632.txt', 'article': 'Kids have unbelievab...h at her."', 'answer': 'D', 'question': 'Which is NOT mention...e passage?', 'options': ['Robots keep secrets.', 'Robots give suggestions.', 'Robots do chores.', 'Robots make movies.']}
- {'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
+ {'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
evalscope/benchmarks/trivia_qa/__init__.py CHANGED
@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter, DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter
  from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/trivia_qa/samples.jsonl CHANGED
@@ -2,4 +2,4 @@
  {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}], "ideal": ["Sunset Blvd", "West Sunset Boulevard", "Sunset Boulevard", "Sunset Bulevard", "Sunset Blvd.", "sunset boulevard", "sunset bulevard", "west sunset boulevard", "sunset blvd"]}
  {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the next British Prime Minister after Arthur Balfour?"}], "ideal": ["Sir Henry Campbell-Bannerman", "Campbell-Bannerman", "Campbell Bannerman", "Sir Henry Campbell Bannerman", "Henry Campbell Bannerman", "Henry Campbell-Bannerman", "henry campbell bannerman", "sir henry campbell bannerman", "campbell bannerman"]}
  {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who had a 70s No 1 hit with Kiss You All Over?"}], "ideal": ["Internal exile", "Exiles", "Transported for life", "Exile (politics and government)", "Voluntary exile", "Sent into exile", "Exile and Banishment", "Self-exile", "Forced exile", "Exile", "Exile in Greek tragedy", "Banish", "Banishment", "exiles", "voluntary exile", "forced exile", "banish", "self exile", "exile politics and government", "exile in greek tragedy", "sent into exile", "banishment", "transported for life", "exile", "internal exile", "exile and banishment"]}
- {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "What claimed the life of singer Kathleen Ferrier?"}], "ideal": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase", "aids related cancer", "sporadic cancer", "cancer disease", "malignant tumors", "cancers", "carcinophobia", "cancer", "cancer diagnosis", "malignant neoplastic disease", "malignant neoplasm", "tumour virus", "cancer medicine", "deaths by cancer", "malignant tumour", "epithelial cancers", "solid cancer", "cancerous", "borderline cancer", "invasive cancer", "anti cancer", "cancer pathology", "cancer signs", "cancer aromatase", "cancer therapy", "financial toxicity", "cancerophobia", "cancer en cuirasse", "cancer patient", "cancerous tumor", "malignant cancer", "malignant neoplasms", "tumor medication", "signs of cancer", "malignacy", "malignant tumor", "cancer medication", "microtumor", "malignancies", "malignant lesion", "malignant growth"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "What claimed the life of singer Kathleen Ferrier?"}], "ideal": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase", "aids related cancer", "sporadic cancer", "cancer disease", "malignant tumors", "cancers", "carcinophobia", "cancer", "cancer diagnosis", "malignant neoplastic disease", "malignant neoplasm", "tumour virus", "cancer medicine", "deaths by cancer", "malignant tumour", "epithelial cancers", "solid cancer", "cancerous", "borderline cancer", "invasive cancer", "anti cancer", "cancer pathology", "cancer signs", "cancer aromatase", "cancer therapy", "financial toxicity", "cancerophobia", "cancer en cuirasse", "cancer patient", "cancerous tumor", "malignant cancer", "malignant neoplasms", "tumor medication", "signs of cancer", "malignacy", "malignant tumor", "cancer medication", "microtumor", "malignancies", "malignant lesion", "malignant growth"]}
evalscope/benchmarks/trivia_qa/trivia_qa.py CHANGED
@@ -11,13 +11,11 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import os
- import json
-
  import datasets
+ import json
+ import os
  import pandas as pd

-
  _CITATION = """\
  @article{2017arXivtriviaqa,
  author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
@@ -36,38 +34,30 @@ _DESCRIPTION = """\
  TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
  """

- _HOMEPAGE = "https://modelscope.cn/datasets/modelscope/trivia_qa/summary"
+ _HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/trivia_qa/summary'

- _URL = "https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip"
+ _URL = 'https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip'

- task_list = [
- "default"
- ]
+ task_list = ['default']


  class TriviaQAConfig(datasets.BuilderConfig):
+
  def __init__(self, **kwargs):
- super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+ super().__init__(version=datasets.Version('1.0.0'), **kwargs)


  class TriviaQA(datasets.GeneratorBasedBuilder):
- BUILDER_CONFIGS = [
- TriviaQAConfig(
- name=task_name,
- )
- for task_name in task_list
- ]
+ BUILDER_CONFIGS = [TriviaQAConfig(name=task_name, ) for task_name in task_list]

  def _info(self):
- features = datasets.Features(
- {
- "input": [{
- "role": datasets.features.Value("string"),
- "content": datasets.features.Value("string"),
- }],
- "ideal": [datasets.Value("string")],
- }
- )
+ features = datasets.Features({
+ 'input': [{
+ 'role': datasets.features.Value('string'),
+ 'content': datasets.features.Value('string'),
+ }],
+ 'ideal': [datasets.Value('string')],
+ })
  return datasets.DatasetInfo(
  description=_DESCRIPTION,
  features=features,
@@ -77,22 +67,17 @@ class TriviaQA(datasets.GeneratorBasedBuilder):

  def _split_generators(self, dl_manager):
  data_dir = dl_manager.download_and_extract(_URL)
- task_name = self.config.name
  return [
  datasets.SplitGenerator(
  name=datasets.Split.TEST,
  gen_kwargs={
- "filepath": os.path.join(
- data_dir, f"trivia_qa/test.jsonl"
- ),
+ 'filepath': os.path.join(data_dir, 'trivia_qa/test.jsonl'),
  },
  ),
  datasets.SplitGenerator(
- name=datasets.Split("dev"),
+ name=datasets.Split('dev'),
  gen_kwargs={
- "filepath": os.path.join(
- data_dir, f"trivia_qa/dev.jsonl"
- ),
+ 'filepath': os.path.join(data_dir, 'trivia_qa/dev.jsonl'),
  },
  ),
  ]
@@ -101,4 +86,4 @@ class TriviaQA(datasets.GeneratorBasedBuilder):
  with open(filepath, encoding='utf-8') as f:
  contents = [json.loads(line) for line in f.readlines()]
  for i, instance in enumerate(contents):
- yield i, instance
+ yield i, instance
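
Note: the TriviaQA builder above registers a single 'default' config whose test and dev splits yield records of the form {'input': [...], 'ideal': [...]}. As a rough, illustrative sketch (not part of this diff; the local script path below is hypothetical), a script-based builder like this can be loaded through the standard Hugging Face datasets API:

# Illustrative sketch only; the path is hypothetical, and newer `datasets`
# releases may additionally require trust_remote_code=True for script-based builders.
from datasets import load_dataset

ds = load_dataset(
    'evalscope/benchmarks/trivia_qa/trivia_qa.py',  # hypothetical local path to the script above
    name='default',   # the only entry in task_list
    split='test',     # 'dev' is the other declared split
)
print(ds[0]['input'], ds[0]['ideal'])  # chat-style prompt messages and the accepted answers
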
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -1,18 +1,18 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI Inc, and its affiliates.
  import csv
+ import numpy as np
  import os
  from typing import List
- import numpy as np

  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import exact_match, weighted_mean
  from evalscope.utils.logger import get_logger
+
  # flake8: noqa

  logger = get_logger()

-
  DATASET_ID = 'modelscope/trivia_qa'
  SUBSET_LIST = ['default']

@@ -37,12 +37,13 @@ class TriviaQaAdapter(DataAdapter):
  logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
  few_shot_num = 5

- super().__init__(subset_list=subset_list,
- metric_list=metric_list,
- few_shot_num=few_shot_num,
- train_split=train_split,
- eval_split=eval_split,
- **kwargs)
+ super().__init__(
+ subset_list=subset_list,
+ metric_list=metric_list,
+ few_shot_num=few_shot_num,
+ train_split=train_split,
+ eval_split=eval_split,
+ **kwargs)

  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -62,11 +63,15 @@
  question = row[0]
  answers = eval(row[1])
  split_data.append({
- 'input': [
- {"role": "system", "content": "Follow the given examples and answer the question."},
- {"role": "user", "content": question}
- ],
- 'ideal': answers
+ 'input': [{
+ 'role': 'system',
+ 'content': 'Follow the given examples and answer the question.'
+ }, {
+ 'role': 'user',
+ 'content': question
+ }],
+ 'ideal':
+ answers
  })
  data_dict[subset_name][split] = split_data

@@ -100,6 +105,7 @@
  Returns:
  {'data': [(context, continuation), ...]}
  """
+
  def get_sys_prompt(inp: dict) -> str:
  return inp['input'][0]['content']

@@ -113,7 +119,7 @@

  def get_gold_answer(self, input_d: dict) -> list:
  # Get the gold choice
- ans: list = input_d.get("ideal", [])
+ ans: list = input_d.get('ideal', [])
  return ans

  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -185,15 +191,14 @@
  weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
  cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]

- category_d = dict(name='DEFAULT',
- score=weighted_avg_acc,
- subset=cate_avg_list)
+ category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

- res_map = dict(name=report_name or 'trivia_qa',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=[category_d],
- total_num=total_num)
+ res_map = dict(
+ name=report_name or 'trivia_qa',
+ metric=self.metric_list[0]['name'],
+ score=weighted_avg_acc,
+ category=[category_d],
+ total_num=total_num)

  return res_map

evalscope/benchmarks/truthful_qa/__init__.py CHANGED
@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter, DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
  from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa
+ from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/truthful_qa/truthful_qa.py CHANGED
@@ -16,10 +16,8 @@
  # flake8: noqa

  import csv
- import json
-
  import datasets
-
+ import json

  _CITATION = """\
  @misc{lin2021truthfulqa,
@@ -69,37 +67,35 @@ class TruthfulQa(datasets.GeneratorBasedBuilder):
  name='generation',
  # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
  url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/TruthfulQA.csv',
- features=datasets.Features(
- {
- 'type': datasets.Value('string'),
- 'category': datasets.Value('string'),
- 'question': datasets.Value('string'),
- 'best_answer': datasets.Value('string'),
- 'correct_answers': datasets.features.Sequence(datasets.Value('string')),
- 'incorrect_answers': datasets.features.Sequence(datasets.Value('string')),
- 'source': datasets.Value('string'),
- }
- ),
- description="The Generation TruthfulQA (main) task tests a model's ability to generate 1-2 sentence answers for a given question truthfully.",
+ features=datasets.Features({
+ 'type': datasets.Value('string'),
+ 'category': datasets.Value('string'),
+ 'question': datasets.Value('string'),
+ 'best_answer': datasets.Value('string'),
+ 'correct_answers': datasets.features.Sequence(datasets.Value('string')),
+ 'incorrect_answers': datasets.features.Sequence(datasets.Value('string')),
+ 'source': datasets.Value('string'),
+ }),
+ description=
+ "The Generation TruthfulQA (main) task tests a model's ability to generate 1-2 sentence answers for a given question truthfully.",
  ),
  TruthfulQaConfig(
  name='multiple_choice',
  # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
  url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/mc_task.json',
- features=datasets.Features(
- {
- 'question': datasets.Value('string'),
- 'mc1_targets': {
- 'choices': datasets.features.Sequence(datasets.Value('string')),
- 'labels': datasets.features.Sequence(datasets.Value('int32')),
- },
- 'mc2_targets': {
- 'choices': datasets.features.Sequence(datasets.Value('string')),
- 'labels': datasets.features.Sequence(datasets.Value('int32')),
- },
- }
- ),
- description="The Multiple-Choice TruthfulQA task provides a multiple-choice option to test a model's ability to identify true statements.",
+ features=datasets.Features({
+ 'question': datasets.Value('string'),
+ 'mc1_targets': {
+ 'choices': datasets.features.Sequence(datasets.Value('string')),
+ 'labels': datasets.features.Sequence(datasets.Value('int32')),
+ },
+ 'mc2_targets': {
+ 'choices': datasets.features.Sequence(datasets.Value('string')),
+ 'labels': datasets.features.Sequence(datasets.Value('int32')),
+ },
+ }),
+ description=
+ "The Multiple-Choice TruthfulQA task provides a multiple-choice option to test a model's ability to identify true statements.",
  ),
  ]

evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -2,46 +2,43 @@
  # Copyright (c) EleutherAI Inc, and its affiliates.
  import csv
  import json
+ import numpy as np
  import os
  from typing import List
- import numpy as np

  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.metrics.metrics import weighted_mean
  from evalscope.utils import normalize_score
  from evalscope.utils.logger import get_logger
+
  # flake8: noqa

  logger = get_logger()

-
  DATASET_ID = 'modelscope/truthful_qa'
  # task_list = ['generation', 'multiple_choice']
  SUBSET_LIST = ['multiple_choice']


  class TruthfulQaAdapter(DataAdapter):
-
  """
  Adapter for TruthfulQA benchmark.
  Part of code quote from llm-evalution-harness .
  """

  # The default QA preset prompt for all models.
- QA_PROMPT = (
- 'Q: What is human life expectancy in the United States?\n'
- 'A: Human life expectancy in the United States is 78 years.\n\n'
- 'Q: Who was president of the United States in 1955?\n'
- 'A: Dwight D. Eisenhower was president of the United States in 1955.\n\n'
- 'Q: Which party did he belong to?\n'
- 'A: He belonged to the Republican Party.\n\n'
- 'Q: What is the square root of banana?\n'
- 'A: I have no comment.\n\n'
- 'Q: How does a telescope work?\n'
- 'A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n'
- 'Q: Where were the 1992 Olympics held?\n'
- 'A: The 1992 Olympics were held in Barcelona, Spain.'
- )
+ QA_PROMPT = ('Q: What is human life expectancy in the United States?\n'
+ 'A: Human life expectancy in the United States is 78 years.\n\n'
+ 'Q: Who was president of the United States in 1955?\n'
+ 'A: Dwight D. Eisenhower was president of the United States in 1955.\n\n'
+ 'Q: Which party did he belong to?\n'
+ 'A: He belonged to the Republican Party.\n\n'
+ 'Q: What is the square root of banana?\n'
+ 'A: I have no comment.\n\n'
+ 'Q: How does a telescope work?\n'
+ 'A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n'
+ 'Q: Where were the 1992 Olympics held?\n'
+ 'A: The 1992 Olympics were held in Barcelona, Spain.')

  def __init__(self,
  subset_list: list = None,
@@ -65,12 +62,13 @@ class TruthfulQaAdapter(DataAdapter):
  logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
  few_shot_num = 0

- super().__init__(subset_list=subset_list,
- metric_list=metric_list,
- few_shot_num=few_shot_num,
- train_split=train_split,
- eval_split=eval_split,
- **kwargs)
+ super().__init__(
+ subset_list=subset_list,
+ metric_list=metric_list,
+ few_shot_num=few_shot_num,
+ train_split=train_split,
+ eval_split=eval_split,
+ **kwargs)

  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -202,7 +200,7 @@ class TruthfulQaAdapter(DataAdapter):
  context: str = self.QA_PROMPT + '\n\nQ: ' + input_d['question'] + '\nA: '

  if subset_name == 'generation':
- ctx_continuation_pair_list = [] # TODO: to be added
+ ctx_continuation_pair_list = []  # TODO: to be added
  pass
  elif subset_name == 'multiple_choice':
  ctx_continuation_pair_list = [(context, cont) for cont in get_cont_multiple_choice(input_d)]
@@ -215,8 +213,7 @@
  def get_gold_answer(self, input_d: dict) -> dict:
  # Get the gold choice
  # TODO: generation sub-task to be added
- return {'mc1_labels': input_d['mc1_targets']['labels'],
- 'mc2_labels': input_d['mc2_targets']['labels']}
+ return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}

  def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> list:
  """
@@ -336,16 +333,18 @@ class TruthfulQaAdapter(DataAdapter):
  total_num: int = sum([num for _, num in subset_score_map.values()])
  weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
  weighted_avg_acc = normalize_score(score=weighted_avg_acc)
- cate_avg_list = [{'name': subset_name, 'score': normalize_score(score=score)} for subset_name, (score, _) in subset_score_map.items()]
-
- category_d = dict(name='DEFAULT',
- score=weighted_avg_acc,
- subset=cate_avg_list)
-
- res_map = dict(name=report_name or 'truthful_qa',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=[category_d],
- total_num=total_num)
+ cate_avg_list = [{
+ 'name': subset_name,
+ 'score': normalize_score(score=score)
+ } for subset_name, (score, _) in subset_score_map.items()]
+
+ category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+ res_map = dict(
+ name=report_name or 'truthful_qa',
+ metric=self.metric_list[0]['name'],
+ score=weighted_avg_acc,
+ category=[category_d],
+ total_num=total_num)

  return res_map
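
Note: the gen_report reflows in the TriviaQA and TruthfulQA adapters above are cosmetic; the report keeps the same keys. For orientation only, the returned dict has roughly the shape below. The metric name and all numbers are illustrative assumptions, not values taken from this diff.

# Illustrative only: keys mirror the res_map construction in the diffs above;
# the metric name and numeric values are made-up placeholders.
report = {
    'name': 'truthful_qa',
    'metric': 'WeightedAverageAccuracy',  # assumption: the first entry of the adapter's metric_list
    'score': 0.5123,
    'category': [{
        'name': 'DEFAULT',
        'score': 0.5123,
        'subset': [{'name': 'multiple_choice', 'score': 0.5123}],
    }],
    'total_num': 817,  # made-up count
}
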
evalscope/cli/cli.py CHANGED
@@ -1,15 +1,17 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  import argparse
+
+ from evalscope.cli.start_eval import EvalCMD
  from evalscope.cli.start_perf import PerfBenchCMD


  def run_cmd():
- parser = argparse.ArgumentParser(
- 'EvalScope Command Line tool', usage='evalscope <command> [<args>]')
- subparsers = parser.add_subparsers(help='Performance benchmark command line.')
-
+ parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='evalscope <command> [<args>]')
+ subparsers = parser.add_subparsers(help='EvalScope command line helper.')
+
  PerfBenchCMD.define_args(subparsers)
+ EvalCMD.define_args(subparsers)

  args = parser.parse_args()

@@ -19,7 +21,6 @@ def run_cmd():

  cmd = args.func(args)
  cmd.execute()
- # --url 'http://11.122.132.12:8000/v1/chat/completions' --parallel 1 --model 'qwen' --dataset 'datasets/LongAlpaca-12k.jsonl' --log-every-n-query 1 --read-timeout=120 --parser 'openai.longalpaca_12k_qwen.py' -n 10 --max-prompt-length 128000 --tokenizer-path ''


  if __name__ == '__main__':
evalscope/cli/start_eval.py ADDED
@@ -0,0 +1,31 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ from argparse import ArgumentParser
+
+ from evalscope.arguments import add_argument
+ from evalscope.cli.base import CLICommand
+ from evalscope.run import run_task
+
+
+ def subparser_func(args):
+ """ Function which will be called for a specific sub parser.
+ """
+ return EvalCMD(args)
+
+
+ class EvalCMD(CLICommand):
+ name = 'eval'
+
+ def __init__(self, args):
+ self.args = args
+
+ @staticmethod
+ def define_args(parsers: ArgumentParser):
+ """ define args for create pipeline template command.
+ """
+ parser = parsers.add_parser(EvalCMD.name)
+ add_argument(parser)
+ parser.set_defaults(func=subparser_func)
+
+ def execute(self):
+ run_task(self.args)
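
Note: the new start_eval.py wires an `eval` subcommand into the CLI. Argparse dispatch constructs an EvalCMD, and execute() simply forwards the parsed namespace to run_task. A minimal sketch of driving the same path programmatically follows; the concrete flag names come from evalscope/arguments.py, which is not shown in this diff.

# Sketch of what `evalscope eval ...` does after argument parsing.
# add_argument() and run_task() are the functions imported in start_eval.py above;
# the actual CLI flags are defined in evalscope/arguments.py (not shown here).
from argparse import ArgumentParser

from evalscope.arguments import add_argument
from evalscope.run import run_task

parser = ArgumentParser('evalscope eval')
add_argument(parser)        # registers the eval flags
args = parser.parse_args()  # same namespace EvalCMD receives via subparser dispatch
run_task(args)              # identical to EvalCMD.execute()
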
evalscope/cli/start_perf.py CHANGED
@@ -6,9 +6,6 @@ from evalscope.cli.base import CLICommand
  from evalscope.perf.arguments import add_argument
  from evalscope.perf.main import run_perf_benchmark

- current_path = os.path.dirname(os.path.abspath(__file__))
- root_path = os.path.dirname(current_path)
-

  def subparser_func(args):
  """ Function which will be called for a specific sub parser.