evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of evalscope has been flagged as potentially problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0

evalscope/evaluator/rating_eval.py
@@ -1,24 +1,17 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from typing import List, Union
-
  import pandas as pd
  import pyarrow as pa
+ from typing import List, Union

  from evalscope.constants import MetricMembers
  from evalscope.utils.arena_utils import compute_elo
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
- from evalscope.utils import jsonl_to_list

  logger = get_logger()

- DEFAULT_COLUMNS_MAPPING = {
- 'model_a': 'model_a',
- 'model_b': 'model_b',
- 'win': 'win',
- 'tstamp': 'ts',
- 'language': 'lang'
- }
+ DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}


  class RatingEvaluate(object):
@@ -41,10 +34,9 @@ class RatingEvaluate(object):
  elo_ratings = compute_elo(battles)
  col_model = 'Model'
  col_elo_rating = 'Elo_Rating'
- elo_ratings_res = pd.DataFrame(
- [[n, elo_ratings[n]] for n in elo_ratings.keys()],
- columns=[col_model, col_elo_rating]).sort_values(
- col_elo_rating, ascending=False).reset_index(drop=True)
+ elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
+ columns=[col_model, col_elo_rating]).sort_values(
+ col_elo_rating, ascending=False).reset_index(drop=True)
  elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
  return elo_ratings_res

@@ -89,23 +81,11 @@ class RatingEvaluate(object):
  'tie': 1
  }]
  else:
- return [{
- 'model': winner,
- 'win': 1,
- 'loss': 0,
- 'tie': 0
- }, {
- 'model': loser,
- 'win': 0,
- 'loss': 1,
- 'tie': 0
- }]
+ return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]

  def compute_pairwise_rating(self, raw_data):
  df_all = self.preprocess(raw_data_df=raw_data)
- model_list = (
- df_all['model_a'].unique().tolist()
- + df_all['model_b'].unique().tolist())
+ model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
  model_list = list(set(model_list))

  list_res = []
@@ -114,8 +94,7 @@ class RatingEvaluate(object):
  if self.baseline_model is not None:
  if self.baseline_model not in [row['model_a'], row['model_b']]:
  logger.warning(
- f'One of the models in the battle should be the baseline model: {self.baseline_model}'
- )
+ f'One of the models in the battle should be the baseline model: {self.baseline_model}')
  continue
  rating = self.get_single_pairwise_rating(row)
  list_res = list_res + rating
@@ -149,15 +128,15 @@ class RatingEvaluate(object):

  for metric in self.metrics:

- if metric == MetricMembers.ELO.value:
+ if metric == MetricMembers.ELO:
  res = self.compute_elo_rating(raw_data)
  res_all.append(res)

- elif metric == MetricMembers.PAIRWISE.value:
+ elif metric == MetricMembers.PAIRWISE:
  res = self.compute_pairwise_rating(raw_data)
  res_all.append(res)

- elif metric == MetricMembers.SCORE.value:
+ elif metric == MetricMembers.SCORE:
  res = self.compute_score_rating(raw_data)
  res_all.append(res)


evalscope/evaluator/reviewer/auto_reviewer.py
@@ -2,6 +2,7 @@
  # flake8: noqa

  import os
+ import pandas as pd
  import random
  import sys
  import time
@@ -9,15 +10,11 @@ from abc import ABC, abstractmethod
  from functools import partial
  from typing import Any, List

- import pandas as pd
-
  from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
  from evalscope.models.openai_model import OpenAIModel
- from evalscope.utils import completion_parsers
- from evalscope.utils.arena_utils import (get_battle_pairs,
- merge_ques_ans,
- shuffle_pairwise_preferences)
- from evalscope.utils import dump_jsonl_data, jsonl_to_list, random_seeded_choice
+ from evalscope.utils import completion_parsers, random_seeded_choice
+ from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
+ from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -33,8 +30,7 @@ class BaseReviewer(ABC):
  """
  Run pairwise battles with given models.
  """
- raise NotImplementedError(
- 'run() method must be implemented in your subclass.')
+ raise NotImplementedError('run() method must be implemented in your subclass.')


  class AutoReviewerGpt4(BaseReviewer):
@@ -71,13 +67,9 @@ class AutoReviewerGpt4(BaseReviewer):

  self.review_result_file = review_result_file
  self.prompt_list = jsonl_to_list(prompt_file)
- self.answer_list = [
- jsonl_to_list(answer_file) for answer_file in answer_file_list
- ]
- self.reference_list = jsonl_to_list(
- reference_file) if reference_file else []
- self.cache_list = jsonl_to_list(
- cache_file) if cache_file and os.path.isfile(cache_file) else []
+ self.answer_list = [jsonl_to_list(answer_file) for answer_file in answer_file_list]
+ self.reference_list = jsonl_to_list(reference_file) if reference_file else []
+ self.cache_list = jsonl_to_list(cache_file) if cache_file and os.path.isfile(cache_file) else []

  self.reviewer_args = reviewer_args if reviewer_args \
  else self._get_default_args()
@@ -88,24 +80,18 @@ class AutoReviewerGpt4(BaseReviewer):
  self.answer_list.append(jsonl_to_list(baseline_file))
  self.baseline_idx = len(self.answer_list) - 1

- self.position_bias_mitigation = self.reviewer_args.pop(
- EvalConfigKeys.POSITION_BIAS_MITIGATION,
- PositionBiasMitigation.NONE)
+ self.position_bias_mitigation = self.reviewer_args.pop(EvalConfigKeys.POSITION_BIAS_MITIGATION,
+ PositionBiasMitigation.NONE)
  if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
- self.random_seed = self.reviewer_args.pop(
- EvalConfigKeys.RANDOM_SEED, 123)
-
- fn_completion_parser = self.reviewer_args.pop(
- EvalConfigKeys.FN_COMPLETION_PARSER,
- FnCompletionParser.LMSYS_PARSER)
- completion_parser_kwargs = self.reviewer_args.pop(
- EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
+ self.random_seed = self.reviewer_args.pop(EvalConfigKeys.RANDOM_SEED, 123)
+
+ fn_completion_parser = self.reviewer_args.pop(EvalConfigKeys.FN_COMPLETION_PARSER,
+ FnCompletionParser.LMSYS_PARSER)
+ completion_parser_kwargs = self.reviewer_args.pop(EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
  if isinstance(fn_completion_parser, str):
- fn_completion_parser = getattr(completion_parsers,
- fn_completion_parser)
+ fn_completion_parser = getattr(completion_parsers, fn_completion_parser)

- self.fn_completion_parser = partial(fn_completion_parser,
- **completion_parser_kwargs)
+ self.fn_completion_parser = partial(fn_completion_parser, **completion_parser_kwargs)
  self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)

  @staticmethod
@@ -133,45 +119,35 @@ class AutoReviewerGpt4(BaseReviewer):
  # Default to general category (idx 0)
  target_prompt_dict = prompts_list[0]
  for item in prompts_list:
- is_category_match = category in item['category'] if isinstance(
- item['category'], list) else item['category'] == category
+ is_category_match = category in item['category'] if isinstance(item['category'],
+ list) else item['category'] == category
  is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
  if is_category_match and is_type_match:
  target_prompt_dict = item
  break
- elif is_type_match and target_prompt_dict.get('type',
- ArenaMode.PAIRWISE) != type:
+ elif is_type_match and target_prompt_dict.get('type', ArenaMode.PAIRWISE) != type:
  target_prompt_dict = item # fallback to type match

  sys_prompt = target_prompt_dict['system_prompt']
  prompt_template = target_prompt_dict['prompt_template']
  defaults = target_prompt_dict.get('defaults', dict({}))
- output_format = target_prompt_dict.get('output_format',
- '[[rating_a,rating_b]]')
+ output_format = target_prompt_dict.get('output_format', '[[rating_a,rating_b]]')

  if type == ArenaMode.SINGLE:
- user_prompt = prompt_template.format(
- question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
+ user_prompt = prompt_template.format(question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
  else:
  user_prompt = prompt_template.format(
- question=ques,
- answer_a=ans1,
- answer_b=ans2,
- ref_answer_1=ans_ref,
- **defaults)
+ question=ques, answer_a=ans1, answer_b=ans2, ref_answer_1=ans_ref, **defaults)

  return sys_prompt, user_prompt, output_format

  def get_review_cache(self, model_a, model_b, question) -> list:
  if model_b:
- cache_hit = next(
- (r for r in self.cache_list if r['model_a'] == model_a
- and r['model_b'] == model_b and r['question'] == question),
- None)
+ cache_hit = next((r for r in self.cache_list
+ if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
+ None)
  else:
- cache_hit = next(
- (r for r in self.cache_list
- if r['model'] == model_a and r['question'] == question), None)
+ cache_hit = next((r for r in self.cache_list if r['model'] == model_a and r['question'] == question), None)
  return cache_hit

  def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
@@ -265,12 +241,10 @@ class AutoReviewerGpt4(BaseReviewer):
  return review_result

  def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
- input_msg = dict(
- ques=question, category=category, ans1=ans1, ans2=ans2)
+ input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)

  if self.reference_list:
- ans_ref = next((ref for ref in self.reference_list
- if ref.get('text') == question), None)
+ ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
  assert ans_ref['answer']
  input_msg['ans_ref'] = ans_ref['answer']

@@ -284,8 +258,7 @@
  else:
  review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)

- result = self.fn_completion_parser(
- review_text, output_format=output_format)
+ result = self.fn_completion_parser(review_text, output_format=output_format)
  if not isinstance(result, tuple):
  result = (result, None)
  return review_text, *result
@@ -294,8 +267,7 @@
  input_msg = dict(ques=question, category=category, ans1=answer)

  if self.reference_list:
- ans_ref = next((ref for ref in self.reference_list
- if ref.get('text') == question), None)
+ ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
  assert ans_ref['answer']
  input_msg['ans_ref'] = ans_ref['answer']

@@ -312,8 +284,7 @@
  score = self.fn_completion_parser(review_text, output_format)
  return review_text, score

- def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str,
- output_format) -> str:
+ def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str, output_format) -> str:
  logger.info('Get dummy scores for input prompt ...')
  if output_format == '[[rating]]':
  return f'[[{round(random.random(), 2)}]]'
@@ -359,8 +330,7 @@
  if self.review_mode == ArenaMode.PAIRWISE:
  battle_pairs = get_battle_pairs(merged_ans_df.columns)
  elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
- battle_pairs = get_battle_pairs(merged_ans_df.columns,
- self.baseline_idx)
+ battle_pairs = get_battle_pairs(merged_ans_df.columns, self.baseline_idx)
  elif self.review_mode == ArenaMode.SINGLE:
  battle_pairs = [(col, ) for col in merged_ans_df.columns]
  else:
@@ -373,14 +343,12 @@
  pair_df.columns = ['output_1', 'output_2']
  pair_df['is_switched_outputs'] = pair_df.apply(
  lambda x: random_seeded_choice(
- seed='is_switched_outputs' + x[0]['text'] + str(
- self.random_seed),
+ seed='is_switched_outputs' + x[0]['text'] + str(self.random_seed),
  choices=[False, True],
  ),
  axis=1,
  )
- pair_df = shuffle_pairwise_preferences(
- pair_df, pair_df['is_switched_outputs'])
+ pair_df = shuffle_pairwise_preferences(pair_df, pair_df['is_switched_outputs'])

  for index, row in pair_df.iterrows():
  row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
@@ -395,17 +363,21 @@ if __name__ == '__main__':

  work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
  prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
- answer_file_list = [os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
- os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')]
+ answer_file_list = [
+ os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
+ os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')
+ ]
  review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')

- input_kwargs = dict(prompt_file=prompt_template_path,
- answer_file_list=answer_file_list,
- review_result_file=review_result_file_path,
- reviewer_args={},
- baseline_file='',
- reference_file='',
- cache_file='', )
+ input_kwargs = dict(
+ prompt_file=prompt_template_path,
+ answer_file_list=answer_file_list,
+ review_result_file=review_result_file_path,
+ reviewer_args={},
+ baseline_file='',
+ reference_file='',
+ cache_file='',
+ )

  auto_reviewer = AutoReviewerGpt4(**input_kwargs)
  auto_reviewer.run(dry_run=True)

evalscope/metrics/bundled_rouge_score/rouge_scorer.py
@@ -29,16 +29,17 @@ In these examples settings.xml lists input files and formats.
  """

  from __future__ import absolute_import, division, print_function
- import collections
- import re
- import os

+ import collections
  import nltk
  import numpy as np
+ import os
+ import re
  import six
  from absl import logging
  from rouge_score import scoring, tokenizers
  from six.moves import map, range
+
  from evalscope.utils import get_logger
  logger = get_logger()
@@ -81,11 +82,7 @@ class RougeScorer(scoring.BaseScorer):
  ... 'The quick brown dog jumps on the log.')
  """

- def __init__(self,
- rouge_types,
- use_stemmer=False,
- split_summaries=False,
- tokenizer=None):
+ def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):

  self.rouge_types = rouge_types
  if tokenizer:
@@ -160,21 +157,15 @@ class RougeScorer(scoring.BaseScorer):
  sents = [x for x in sents if len(x)]
  return sents

- target_tokens_list = [
- self._tokenizer.tokenize(s) for s in get_sents(target)
- ]
- prediction_tokens_list = [
- self._tokenizer.tokenize(s) for s in get_sents(prediction)
- ]
+ target_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(target)]
+ prediction_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(prediction)]

- scores = _summary_level_lcs(target_tokens_list,
- prediction_tokens_list)
+ scores = _summary_level_lcs(target_tokens_list, prediction_tokens_list)
  elif re.match(r'rouge[0-9]$', six.ensure_str(rouge_type)):
  # Rouge from n-grams.
  n = int(rouge_type[5:])
  if n <= 0:
- raise ValueError('rougen requires positive n: %s'
- % rouge_type)
+ raise ValueError('rougen requires positive n: %s' % rouge_type)
  target_ngrams = _create_ngrams(target_tokens, n)
  prediction_ngrams = _create_ngrams(prediction_tokens, n)
  scores = _score_ngrams(target_ngrams, prediction_ngrams)
@@ -349,8 +340,7 @@ def _score_ngrams(target_ngrams, prediction_ngrams):

  intersection_ngrams_count = 0
  for ngram in six.iterkeys(target_ngrams):
- intersection_ngrams_count += min(target_ngrams[ngram],
- prediction_ngrams[ngram])
+ intersection_ngrams_count += min(target_ngrams[ngram], prediction_ngrams[ngram])
  target_ngrams_count = sum(target_ngrams.values())
  prediction_ngrams_count = sum(prediction_ngrams.values())


evalscope/metrics/code_metric.py
@@ -4,7 +4,6 @@ import inspect
  import re
  import signal
  from collections import defaultdict
-
  from tqdm import tqdm


@@ -20,8 +19,7 @@ def check_input(text, arg):
  code_block = code_block_pattern.search(text)
  code_string = code_block.group(1)

- function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(',
- re.DOTALL)
+ function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(', re.DOTALL)
  function_name_block = function_name_pattern.search(code_string)
  function_name = function_name_block.group(1)

@@ -52,9 +50,7 @@ def exec_func(func, arr):


  def compute_pass_k_one_sample(predict, func_args, func_outputs, k=4):
- assert len(
- predict
- ) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
+ assert len(predict) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
  for predict_i in predict[:k]:
  try:
  for arg, gold in zip(func_args, func_outputs):
@@ -87,9 +83,7 @@ def compute_pass_k(predict_l, reference_l, func_args_l, k=4, lang='py'):
  def run_code_eval(data_l, k=4, md_level=2):
  print(f"{'#' * md_level} Code Eval(pass@{k})")
  for data in tqdm(data_l):
- data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'],
- data['func_args'],
- data['func_outputs'], k)
+ data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'], data['func_args'], data['func_outputs'], k)
  task_data_d = defaultdict(list)
  for data in data_l:
  for task in data['task_tags']:

evalscope/metrics/math_accuracy.py
@@ -2,7 +2,6 @@

  import re
  from collections import defaultdict
-
  from tqdm import tqdm

  from evalscope.constants import MetricsConstant
@@ -44,8 +43,7 @@ def compute_math_accuracy(predict_l, reference_l):
  def run_math_eval(data_l, md_level=2):
  print(f"{'#' * md_level} Math Eval(math accuracy)")
  for data in tqdm(data_l):
- data['math_accuracy'] = compute_math_accuracy_one_sample(
- data['gen'], data['target'])
+ data['math_accuracy'] = compute_math_accuracy_one_sample(data['gen'], data['target'])
  task_data_d = defaultdict(list)
  for data in data_l:
  for task in data['task_tags']:
@@ -54,7 +52,6 @@ def run_math_eval(data_l, md_level=2):
  print(f'[total], count: {len(data_l)}, math accuracy: '
  f'{correct_cnt / len(data_l) * 100:0.2f}%')
  for task in task_data_d.keys():
- correct_cnt = sum(
- [data['math_accuracy'] for data in task_data_d[task]])
+ correct_cnt = sum([data['math_accuracy'] for data in task_data_d[task]])
  print(f'[{task}], count: {len(task_data_d[task])}, math accuracy: '
- f'{correct_cnt/len(task_data_d[task])*100:0.2f}%')
+ f'{correct_cnt / len(task_data_d[task]) * 100:0.2f}%')

evalscope/metrics/metrics.py
@@ -2,18 +2,17 @@
  # Copyright (c) EleutherAI. and its affiliates.
  # Copyright (c) OpenAI. and its affiliates.
  import itertools
- import math
- from collections.abc import Iterable
- from collections import defaultdict
- from typing import Dict, List, Union
- from nltk.translate.bleu_score import sentence_bleu
- from nltk import word_tokenize
  import jieba
-
+ import math
  import numpy as np
+ import random
  import sacrebleu
  import sklearn.metrics
- import random
+ from collections import defaultdict
+ from collections.abc import Iterable
+ from nltk import word_tokenize
+ from nltk.translate.bleu_score import sentence_bleu
+ from typing import Dict, List, Union


  def mean(arr):
@@ -22,12 +21,12 @@ def mean(arr):

  def pop_stddev(arr):
  mu = mean(arr)
- return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
+ return math.sqrt(sum([(x - mu)**2 for x in arr]) / len(arr))


  def sample_stddev(arr):
  mu = mean(arr)
- return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+ return math.sqrt(sum([(x - mu)**2 for x in arr]) / (len(arr) - 1))


  def mean_stderr(arr):
@@ -134,13 +133,14 @@ def bleu(items):
  refs, preds = _sacreformat(refs, preds)
  return sacrebleu.corpus_bleu(preds, refs).score

+
  def bleu_ngram_one_sample(predict, reference):
  """
  Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores

  Args:
  items: [(ref, pred)]
-
+
  Returns:
  {
  'bleu-1': 0.8,
@@ -150,6 +150,7 @@ def bleu_ngram_one_sample(predict, reference):
  }

  """
+
  def is_contains_chinese(strs):
  for _char in strs:
  if '\u4e00' <= _char <= '\u9fa5':
@@ -230,6 +231,7 @@ def _sacreformat(refs, preds):


  class _bootstrap_internal:
+
  def __init__(self, f, n):
  self.f = f
  self.n = n
@@ -260,11 +262,11 @@ def bootstrap_stderr(f, xs, iters):

  print('bootstrapping for stddev:', f.__name__)
  for bootstrap in tqdm(
- pool.imap(
- _bootstrap_internal(f, chunk_size),
- [(i, xs) for i in range(iters // chunk_size)],
- ),
- total=iters // chunk_size,
+ pool.imap(
+ _bootstrap_internal(f, chunk_size),
+ [(i, xs) for i in range(iters // chunk_size)],
+ ),
+ total=iters // chunk_size,
  ):
  # sample w replacement
  res.extend(bootstrap)
@@ -372,11 +374,9 @@ def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[s
  return score / len(question_answers)


- def calculate_pass_at_k(
- num_samples: Union[int, List[int], np.ndarray],
- num_correct: Union[List[int], np.ndarray],
- k: int = 1
- ) -> np.ndarray:
+ def calculate_pass_at_k(num_samples: Union[int, List[int], np.ndarray],
+ num_correct: Union[List[int], np.ndarray],
+ k: int = 1) -> np.ndarray:
  """
  Estimates pass@k of each problem and returns them in an array.
  Examples:

evalscope/metrics/rouge_metric.py
@@ -1,18 +1,16 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ import jieba
  import logging
  from collections import defaultdict
  from pathlib import Path
+ from rouge_chinese import Rouge
  from statistics import mean
-
  from tqdm import tqdm

  from evalscope.constants import MetricsConstant
  from evalscope.metrics.bundled_rouge_score import rouge_scorer

- from rouge_chinese import Rouge
- import jieba
-

  class DummyTokenizer:
@@ -24,9 +22,7 @@ HERE = Path(__file__).absolute().parent
  logger = logging.getLogger(__name__)


- scorer = rouge_scorer.RougeScorer(
- ['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer()
- )
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
  zh_scorer = Rouge()


@@ -52,11 +48,7 @@ def compute_rouge_score(predict_l, reference_l):
  result[rouge_key].append(one_sample[rouge_key])
  rlt = {}
  for rouge_key in MetricsConstant.ROUGE_KEYS:
- rlt[rouge_key] = (
- mean(result[rouge_key]) * 100
- if rouge_key in result
- else MetricsConstant.INVALID_VALUE
- )
+ rlt[rouge_key] = (mean(result[rouge_key]) * 100 if rouge_key in result else MetricsConstant.INVALID_VALUE)
  return rlt


@@ -111,9 +103,9 @@ def _to_table(final_result) -> str:
  if not task:
  continue
  elif task == 'total':
- row.append(f'{final_result["total"]["rouge"][rouge_key] :0.2f}')
+ row.append(f'{final_result["total"]["rouge"][rouge_key]:0.2f}')
  else:
- row.append(f'{final_result["tasks"][task]["rouge"][rouge_key] :0.2f}')
+ row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]:0.2f}')
  table.append('\t'.join(row))

  return '\n'.join(table)
@@ -122,23 +114,17 @@ def _to_table(final_result) -> str:
  def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
  print(f"{'#' * md_level} Rouge Eval")
  for data in tqdm(data_l):
- data['rouge'] = compute_rouge_score_one_sample(
- data['gen_tok_str'], data['reference_tok_str']
- )
+ data['rouge'] = compute_rouge_score_one_sample(data['gen_tok_str'], data['reference_tok_str'])
  task_data_d = defaultdict(list)
  for data in data_l:
  for task in data['task_tags']:
  task_data_d[task].append(data)

  total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
- print(
- f'[total], count: {len(data_l)}, {report_metric_key}: '
- f'{total_rouge * 100:0.2f}%'
- )
+ print(f'[total], count: {len(data_l)}, {report_metric_key}: '
+ f'{total_rouge * 100:0.2f}%')

  for task, task_data in task_data_d.items():
  task_rouge = mean([data['rouge'][report_metric_key] for data in task_data])
- print(
- f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
- f'{task_rouge * 100:0.2f}%'
- )
+ print(f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
+ f'{task_rouge * 100:0.2f}%')

evalscope/models/__init__.py
@@ -1,4 +1,3 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.models.model import BaseModel
- from evalscope.models.model import ChatBaseModel
+ from evalscope.models.model import BaseModel, ChatBaseModel