evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,342 @@
1
+ # Copyright 2022 The rouge_score Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Computes rouge scores between two text blobs.
15
+
16
+ Implementation replicates the functionality in the original ROUGE package. See:
17
+
18
+ Lin, Chin-Yew. ROUGE: a Package for Automatic Evaluation of Summaries. In
19
+ Proceedings of the Workshop on Text Summarization Branches Out (WAS 2004),
20
+ Barcelona, Spain, July 25 - 26, 2004.
21
+
22
+ Default options are equivalent to running:
23
+ ROUGE-1.5.5.pl -e data -n 2 -a settings.xml
24
+
25
+ Or with use_stemmer=True:
26
+ ROUGE-1.5.5.pl -m -e data -n 2 -a settings.xml
27
+
28
+ In these examples settings.xml lists input files and formats.
29
+ """
30
+
31
+ from __future__ import absolute_import, division, print_function
32
+ import collections
33
+ import re
34
+
35
+ import nltk
36
+ import numpy as np
37
+ import six
38
+ from absl import logging
39
+ from rouge_score import scoring, tokenizers
40
+ from six.moves import map, range
41
+
42
+
43
class RougeScorer(scoring.BaseScorer):
    """
    Calculate rouges scores between two blobs of text.

    Args:
        rouge_types: A list of rouge types to calculate.
        use_stemmer: Bool indicating whether Porter stemmer should be used to
            strip word suffixes to improve matching. This arg is used in the
            DefaultTokenizer, but other tokenizers might or might not choose to
            use this.
        split_summaries: whether to add newlines between sentences for rougeLsum
        tokenizer: Tokenizer object which has a tokenize() method.

    Returns:
        A dict mapping rouge types to Score tuples.

    Examples:
        >>> scorer = RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        >>> scores = scorer.score('The quick brown fox jumps over the lazy dog',
        ...                       'The quick brown dog jumps on the log.')
    """

    def __init__(self,
                 rouge_types,
                 use_stemmer=False,
                 split_summaries=False,
                 tokenizer=None):
        # Rouge types are validated lazily in score(); an unsupported type
        # only raises ValueError there, not here.
        self.rouge_types = rouge_types
        if tokenizer:
            self._tokenizer = tokenizer
        else:
            # Fall back to the package default tokenizer; `use_stemmer` is
            # only consulted on this path.
            self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
            logging.info('Using default tokenizer.')

        self._split_summaries = split_summaries

    def score_multi(self, targets, prediction):
        """
        Calculates rouge scores between targets and prediction.
        The target with the maximum f-measure is used for the final score for
        each score type.

        Args:
            targets: list of texts containing the targets
            prediction: Text containing the predicted text.

        Returns:
            A dict mapping each rouge type to a Score object.

        Raises:
            ValueError: If an invalid rouge type is encountered.
        """
        score_dicts = [self.score(t, prediction) for t in targets]
        max_score = {}
        # The best target is chosen independently per rouge type, so the
        # returned scores may come from different targets.
        for k in self.rouge_types:
            index = np.argmax([s[k].fmeasure for s in score_dicts])
            max_score[k] = score_dicts[index][k]

        return max_score

    def score(self, target, prediction):
        """
        Calculates rouge scores between the target and prediction.

        Args:
            target: Text containing the target (ground truth) text.
            prediction: Text containing the predicted text.

        Returns:
            A dict mapping each rouge type to a Score object.

        Raises:
            ValueError: If an invalid rouge type is encountered.
        """
        # Pre-compute target tokens and prediction tokens for use by different
        # types, except if only "rougeLsum" is requested (it tokenizes per
        # sentence instead).
        if len(self.rouge_types) == 1 and self.rouge_types[0] == 'rougeLsum':
            target_tokens = None
            prediction_tokens = None
        else:
            target_tokens = self._tokenizer.tokenize(target)
            prediction_tokens = self._tokenizer.tokenize(prediction)
        result = {}

        for rouge_type in self.rouge_types:
            if rouge_type == 'rougeL':
                # Rouge from longest common subsequences.
                scores = _score_lcs(target_tokens, prediction_tokens)
            elif rouge_type == 'rougeLsum':
                # Note: Does not support multi-line text.
                def get_sents(text):
                    if self._split_summaries:
                        sents = nltk.sent_tokenize(text)
                    else:
                        # Assume sentences are separated by newline.
                        sents = six.ensure_str(text).split('\n')
                    # Drop empty strings so they do not contribute zero-token
                    # sentences.
                    sents = [x for x in sents if len(x)]
                    return sents

                target_tokens_list = [
                    self._tokenizer.tokenize(s) for s in get_sents(target)
                ]
                prediction_tokens_list = [
                    self._tokenizer.tokenize(s) for s in get_sents(prediction)
                ]

                scores = _summary_level_lcs(target_tokens_list,
                                            prediction_tokens_list)
            elif re.match(r'rouge[0-9]$', six.ensure_str(rouge_type)):
                # Rouge from n-grams, e.g. 'rouge2' -> bigrams.
                n = int(rouge_type[5:])
                if n <= 0:
                    raise ValueError('rougen requires positive n: %s'
                                     % rouge_type)
                target_ngrams = _create_ngrams(target_tokens, n)
                prediction_ngrams = _create_ngrams(prediction_tokens, n)
                scores = _score_ngrams(target_ngrams, prediction_ngrams)
            else:
                raise ValueError('Invalid rouge type: %s' % rouge_type)
            result[rouge_type] = scores

        return result
167
+
168
+
169
+ def _create_ngrams(tokens, n):
170
+ """
171
+ Creates ngrams from the given list of tokens.
172
+
173
+ Args:
174
+ tokens: A list of tokens from which ngrams are created.
175
+ n: Number of tokens to use, e.g. 2 for bigrams.
176
+
177
+ Returns:
178
+ A dictionary mapping each bigram to the number of occurrences.
179
+ """
180
+
181
+ ngrams = collections.Counter()
182
+ for ngram in (tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)):
183
+ ngrams[ngram] += 1
184
+ return ngrams
185
+
186
+
187
def _score_lcs(target_tokens, prediction_tokens):
    """
    Computes LCS (Longest Common Subsequence) rouge scores.

    Args:
        target_tokens: Tokens from the target text.
        prediction_tokens: Tokens from the predicted text.

    Returns:
        A Score object containing computed scores.
    """
    # Either side empty means there is nothing to match.
    if not target_tokens or not prediction_tokens:
        return scoring.Score(precision=0, recall=0, fmeasure=0)

    # Bottom-up DP table; the LCS length sits in the final cell.
    lcs_length = _lcs_table(target_tokens, prediction_tokens)[-1][-1]

    precision = lcs_length / len(prediction_tokens)
    recall = lcs_length / len(target_tokens)

    return scoring.Score(
        precision=precision,
        recall=recall,
        fmeasure=scoring.fmeasure(precision, recall))
211
+
212
+
213
+ def _lcs_table(ref, can):
214
+ """Create 2-d LCS score table."""
215
+ rows = len(ref)
216
+ cols = len(can)
217
+ lcs_table = [[0] * (cols + 1) for _ in range(rows + 1)]
218
+ for i in range(1, rows + 1):
219
+ for j in range(1, cols + 1):
220
+ if ref[i - 1] == can[j - 1]:
221
+ lcs_table[i][j] = lcs_table[i - 1][j - 1] + 1
222
+ else:
223
+ lcs_table[i][j] = max(lcs_table[i - 1][j], lcs_table[i][j - 1])
224
+ return lcs_table
225
+
226
+
227
+ def _backtrack_norec(t, ref, can):
228
+ """Read out LCS."""
229
+ i = len(ref)
230
+ j = len(can)
231
+ lcs = []
232
+ while i > 0 and j > 0:
233
+ if ref[i - 1] == can[j - 1]:
234
+ lcs.insert(0, i - 1)
235
+ i -= 1
236
+ j -= 1
237
+ elif t[i][j - 1] > t[i - 1][j]:
238
+ j -= 1
239
+ else:
240
+ i -= 1
241
+ return lcs
242
+
243
+
244
def _summary_level_lcs(ref_sent, can_sent):
    """
    ROUGE: Summary-level LCS, section 3.2 in ROUGE paper.

    Args:
        ref_sent: list of tokenized reference sentences
        can_sent: list of tokenized candidate sentences

    Returns:
        summary level ROUGE score (a scoring.Score tuple)
    """
    if not ref_sent or not can_sent:
        return scoring.Score(precision=0, recall=0, fmeasure=0)

    # m / n are the total token counts of the reference / candidate summaries
    # and serve as the recall / precision denominators.
    m = sum(map(len, ref_sent))
    n = sum(map(len, can_sent))
    if not n or not m:
        return scoring.Score(precision=0, recall=0, fmeasure=0)

    # get token counts to prevent double counting
    token_cnts_r = collections.Counter()
    token_cnts_c = collections.Counter()
    for s in ref_sent:
        # s is a list of tokens
        token_cnts_r.update(s)
    for s in can_sent:
        token_cnts_c.update(s)

    hits = 0
    for r in ref_sent:
        lcs = _union_lcs(r, can_sent)
        # Prevent double-counting:
        # The paper describes just computing hits += len(_union_lcs()),
        # but the implementation prevents double counting. We also
        # implement this as in version 1.5.5.
        # A token only counts while both sides still have unused copies;
        # each hit consumes one copy from each counter.
        for t in lcs:
            if token_cnts_c[t] > 0 and token_cnts_r[t] > 0:
                hits += 1
                token_cnts_c[t] -= 1
                token_cnts_r[t] -= 1

    recall = hits / m
    precision = hits / n
    fmeasure = scoring.fmeasure(precision, recall)
    return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
290
+
291
+
292
def _union_lcs(ref, c_list):
    """
    Find union LCS between a ref sentence and list of candidate sentences.

    Args:
        ref: list of tokens
        c_list: list of tokenized candidate sentences

    Returns:
        List of tokens in ref representing union LCS.
    """
    # One LCS (as ref indices) per candidate sentence, merged and mapped
    # back to tokens.
    per_candidate = [lcs_ind(ref, candidate) for candidate in c_list]
    return [ref[position] for position in _find_union(per_candidate)]
306
+
307
+
308
+ def _find_union(lcs_list):
309
+ """Finds union LCS given a list of LCS."""
310
+ return sorted(list(set().union(*lcs_list)))
311
+
312
+
313
def lcs_ind(ref, can):
    """Returns one of the longest lcs (as indices into ref)."""
    # Build the DP table, then walk it back to recover the actual indices.
    return _backtrack_norec(_lcs_table(ref, can), ref, can)
317
+
318
+
319
def _score_ngrams(target_ngrams, prediction_ngrams):
    """
    Computes n-gram based rouge scores.

    Args:
        target_ngrams: A Counter object mapping each ngram to number of
            occurrences for the target text.
        prediction_ngrams: A Counter object mapping each ngram to number of
            occurrences for the prediction text.

    Returns:
        A Score object containing computed scores.
    """
    # Counter intersection keeps min(count) per key, which is exactly the
    # clipped n-gram overlap the original computed with a manual loop over
    # six.iterkeys (a Python-2 compatibility shim; this wheel is py3-only).
    intersection_ngrams_count = sum(
        (target_ngrams & prediction_ngrams).values())
    target_ngrams_count = sum(target_ngrams.values())
    prediction_ngrams_count = sum(prediction_ngrams.values())

    # max(..., 1) guards the empty-Counter case against division by zero.
    precision = intersection_ngrams_count / max(prediction_ngrams_count, 1)
    recall = intersection_ngrams_count / max(target_ngrams_count, 1)
    fmeasure = scoring.fmeasure(precision, recall)

    return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
@@ -0,0 +1,104 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import inspect
4
+ import re
5
+ import signal
6
+ from collections import defaultdict
7
+
8
+ from tqdm import tqdm
9
+
10
+
11
def handle(signum, frame):
    """SIGALRM handler: abort user code that runs longer than 10 seconds."""
    # Message intentionally left as-is; callers print/match it at runtime.
    raise RuntimeError('程序执行超过10秒')


def check_input(text, arg):
    """
    Extract the python code block and its function name from a model answer.

    Args:
        text: Raw model output expected to contain a ```python ...``` block.
        arg: Value substituted for any `input(...)` call in the answer so the
            generated code becomes self-contained.

    Returns:
        Tuple of (code_string, function_name).

    Raises:
        AttributeError: if no code block or no `def` is found (the regex
            `.search()` returns None and `.group()` fails).
    """
    pattern = r'input\((.*?)\n'
    text = re.sub(pattern, '{}\n'.format(arg), text)

    code_block_pattern = re.compile(r'```[Pp]ython\n(.*?)\n```', re.DOTALL)
    code_block = code_block_pattern.search(text)
    code_string = code_block.group(1)

    function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(',
                                       re.DOTALL)
    function_name_block = function_name_pattern.search(code_string)
    function_name = function_name_block.group(1)

    return code_string, function_name


def compile_func(code_string, function_name):
    """
    Execute `code_string` and return the function named `function_name`.

    A 10-second SIGALRM guard aborts run-away definitions. WARNING: this
    executes untrusted model output with exec(); only run in a sandbox.

    Raises:
        RuntimeError: if execution exceeds 10 seconds.
        KeyError: if the code does not define `function_name`.
    """
    signal.signal(signal.SIGALRM, handle)
    signal.alarm(10)
    try:
        # Execute in an explicit namespace. The original `exec(code)` +
        # `eval(name)` relied on exec's writes to a function's locals()
        # being visible afterwards, which is undefined behavior inside a
        # function body and breaks under PEP 667 (Python 3.13).
        namespace = {}
        exec(code_string, namespace)
        func = namespace[function_name]
    finally:
        # Always cancel the alarm so a pending signal cannot fire later.
        signal.alarm(0)
    return func


def exec_func(func, arr):
    """
    Call `func` under a 10-second alarm, passing `arr` iff it takes params.

    Returns:
        Whatever `func` returns.

    Raises:
        RuntimeError: if the call exceeds 10 seconds.
    """
    signal.signal(signal.SIGALRM, handle)
    signal.alarm(10)
    try:
        sig = inspect.signature(func)
        params = [param for param in sig.parameters]
        # Zero-parameter functions are invoked without the test argument.
        result = func() if len(params) == 0 else func(arr)
    finally:
        signal.alarm(0)
    return result
52
+
53
+
54
def compute_pass_k_one_sample(predict, func_args, func_outputs, k=4):
    """
    Return 1 if any of the first `k` generations passes every test case.

    Args:
        predict: list of generated answers (must contain at least k items).
        func_args: iterable of test-case inputs, one per case.
        func_outputs: iterable of expected outputs, aligned with func_args.
        k: number of generations to try.

    Returns:
        1 if some generation passes all (arg, gold) pairs, otherwise 0.
    """
    assert len(
        predict
    ) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
    for predict_i in predict[:k]:
        try:
            for arg, gold in zip(func_args, func_outputs):
                # Extract and compile the candidate solution; `arg` is
                # substituted for any input() call in the generated code.
                code_string, function_name = check_input(predict_i, arg)
                func = compile_func(code_string, function_name)

                exec_result = exec_func(func, arg)
                # Tuples are normalized to lists so their string form
                # matches list-formatted gold outputs.
                if type(exec_result) is tuple:
                    exec_result = list(exec_result)
                exec_result = str(exec_result).lower()
                # Case-insensitive string comparison; a mismatch raises
                # AssertionError and the next generation is tried.
                assert exec_result == str(gold).lower()
                del func
            return 1
        except Exception as e:
            # Any failure (extraction, compile, timeout, wrong answer) is
            # printed and counted as a miss for this generation.
            print(e)
    return 0
74
+
75
+
76
def compute_pass_k(predict_l, reference_l, func_args_l, k=4, lang='py'):
    """
    Compute the pass@k rate over a list of samples.

    Args:
        predict_l: list of per-sample generation lists.
        reference_l: list of per-sample values forwarded as the second
            argument of compute_pass_k_one_sample.
        func_args_l: list of per-sample values forwarded as the third
            argument of compute_pass_k_one_sample.
        k: number of generations considered per sample.
        lang: only 'py' is supported; anything else just prints a warning
            and proceeds.

    Returns:
        Dict {'pass@k': fraction of samples that pass}.
    """
    if lang != 'py':
        print('only support python code.')

    assert len(predict_l) == len(reference_l) == len(func_args_l)
    pass_k_cnt = 0
    for predict, ref, func_args in zip(predict_l, reference_l, func_args_l):
        # NOTE(review): `ref` lands in the `func_args` slot and `func_args`
        # in the `func_outputs` slot of compute_pass_k_one_sample — the
        # parameter names here look swapped; verify against callers before
        # changing.
        pass_k_cnt += compute_pass_k_one_sample(predict, ref, func_args, k)
    return {'pass@k': pass_k_cnt / len(predict_l)}
85
+
86
+
87
def run_code_eval(data_l, k=4, md_level=2):
    """Score each sample with pass@k and print overall/per-task accuracy
    as a markdown section of the given heading level."""
    print(f"{'#' * md_level} Code Eval(pass@{k})")
    metric_key = f'pass@{k}'
    for sample in tqdm(data_l):
        sample[metric_key] = compute_pass_k_one_sample(
            sample['gen'], sample['func_args'], sample['func_outputs'], k)

    # Group samples by task tag; a sample may contribute to several tags.
    by_task = defaultdict(list)
    for sample in data_l:
        for tag in sample['task_tags']:
            by_task[tag].append(sample)

    total_correct = sum(sample[metric_key] for sample in data_l)
    print(f'[total], count: {len(data_l)}, pass@{k}: '
          f'{total_correct / len(data_l) * 100:0.2f}%')
    for tag, samples in by_task.items():
        tag_correct = sum(sample[metric_key] for sample in samples)
        print(f'[{tag}], count: {len(samples)}, pass@{k}: '
              f'{tag_correct / len(samples) * 100:0.2f}%')
@@ -0,0 +1,60 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import re
4
+ from collections import defaultdict
5
+
6
+ from tqdm import tqdm
7
+
8
+ from evalscope.constants import MetricsConstant
9
+
10
+
11
def get_last_number(s):
    """
    Return the last number appearing in `s` as a float.

    Args:
        s: arbitrary text, typically a model answer.

    Returns:
        The last integer or decimal (sign included) in the text, or the
        sentinel -100000.0 when no number is present.
    """
    # The original searched the reversed string with a forward-oriented
    # pattern, which drops a leading minus sign ('-5' parsed as 5.0).
    # Scanning forward and taking the final match keeps the sign.
    matches = re.findall(r'[-+]?\d*\.\d+|[-+]?\d+', s)
    if matches:
        return float(matches[-1])
    return float(-100000)
18
+
19
+
20
def compute_math_accuracy_one_sample(predict, reference):
    """Return 1 when the last numbers of predict/reference agree within
    MetricsConstant.EPSILON, else 0. List inputs use their first element."""
    predict_text = predict[0] if isinstance(predict, list) else predict
    reference_text = reference[0] if isinstance(reference, list) else reference
    delta = get_last_number(predict_text) - get_last_number(reference_text)
    return 1 if abs(delta) <= MetricsConstant.EPSILON else 0
31
+
32
+
33
def compute_math_accuracy(predict_l, reference_l):
    """
    Compute the fraction of samples whose last numbers match.

    Args:
        predict_l: list of predictions (str or list of str).
        reference_l: list of references, same length as predict_l.

    Returns:
        Dict {'math accuracy': fraction in [0, 1]}.
    """
    assert len(predict_l) == len(reference_l)
    if not predict_l:
        # The original returned a bare int 0 here but a dict otherwise;
        # keep the return type consistent for callers.
        return {'math accuracy': 0.0}
    correct_cnt = sum(
        compute_math_accuracy_one_sample(predict, reference)
        for predict, reference in zip(predict_l, reference_l))
    return {'math accuracy': correct_cnt / len(predict_l)}
42
+
43
+
44
def run_math_eval(data_l, md_level=2):
    """Score each sample with last-number math accuracy and print the
    overall and per-task percentages as a markdown section."""
    print(f"{'#' * md_level} Math Eval(math accuracy)")
    for sample in tqdm(data_l):
        sample['math_accuracy'] = compute_math_accuracy_one_sample(
            sample['gen'], sample['target'])

    # Group samples by task tag; a sample may belong to several tags.
    by_task = defaultdict(list)
    for sample in data_l:
        for tag in sample['task_tags']:
            by_task[tag].append(sample)

    total_correct = sum(sample['math_accuracy'] for sample in data_l)
    print(f'[total], count: {len(data_l)}, math accuracy: '
          f'{total_correct / len(data_l) * 100:0.2f}%')
    for tag, samples in by_task.items():
        tag_correct = sum(sample['math_accuracy'] for sample in samples)
        print(f'[{tag}], count: {len(samples)}, math accuracy: '
              f'{tag_correct/len(samples)*100:0.2f}%')