evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/metrics/metrics.py
@@ -0,0 +1,405 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright (c) EleutherAI. and its affiliates.
+ # Copyright (c) OpenAI. and its affiliates.
+ import itertools
+ import math
+ from collections.abc import Iterable
+ from collections import defaultdict
+ from typing import Dict, List, Union
+ from nltk.translate.bleu_score import sentence_bleu
+ from nltk import word_tokenize
+ import jieba
+
+ import numpy as np
+ import sacrebleu
+ import sklearn.metrics
+ import random
+
+
+ def mean(arr):
+     return sum(arr) / len(arr)
+
+
+ def pop_stddev(arr):
+     mu = mean(arr)
+     return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
+
+
+ def sample_stddev(arr):
+     mu = mean(arr)
+     return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+
+
+ def mean_stderr(arr):
+     return sample_stddev(arr) / math.sqrt(len(arr))
+
+
+ def median(arr):
+     return arr[len(arr) // 2]
+
+
+ def matthews_corrcoef(items):
+     unzipped_list = list(zip(*items))
+     golds = unzipped_list[0]
+     preds = unzipped_list[1]
+     return sklearn.metrics.matthews_corrcoef(golds, preds)
+
+
+ def f1_score(items):
+     unzipped_list = list(zip(*items))
+     golds = unzipped_list[0]
+     preds = unzipped_list[1]
+     fscore = sklearn.metrics.f1_score(golds, preds)
+
+     return np.max(fscore)
+
+
+ def acc_all(items):
+     # Only count as correct if all answers are labeled correctly for each question
+     question_scoring_dict = {}
+     preds = list(zip(*items))[0]
+     docs = list(zip(*items))[1]
+
+     for doc, pred in zip(docs, preds):
+         paragraph_id = doc['idx']['paragraph']
+         question_id = doc['idx']['question']
+         if (paragraph_id, question_id) not in question_scoring_dict:
+             question_scoring_dict[(paragraph_id, question_id)] = []
+
+         gold_label = doc['label'] == 1
+
+         question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred)
+     acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
+     return acc
+
+
+ def acc_all_stderr(items):
+     # Only count as correct if all answers are labeled correctly for each question
+     question_scoring_dict = {}
+     preds = list(zip(*items))[0]
+     docs = list(zip(*items))[1]
+
+     for doc, pred in zip(docs, preds):
+         question_id = doc['idx']['question']
+         if question_id not in question_scoring_dict:
+             question_scoring_dict[question_id] = []
+
+         gold_label = doc['label'] == 1
+         question_scoring_dict[question_id].append(gold_label == pred)
+
+     acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
+     return acc
+
+
+ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+     """Compute max metric between prediction and each ground truth."""
+     scores_for_ground_truths = []
+     for ground_truth in ground_truths:
+         score = metric_fn(prediction, ground_truth)
+         scores_for_ground_truths.append(score)
+     return max(scores_for_ground_truths)
+
+
+ def perplexity(items):
+     return math.exp(-mean(items))
+
+
+ def weighted_mean(items) -> float:
+     # e.g. [(0,1), (0.5,1), (1,1)]
+     a, b = zip(*items)
+     return sum(a) / sum(b)
+
+
+ def weighted_perplexity(items):
+     return math.exp(-weighted_mean(items))
+
+
+ def bits_per_byte(items):
+     return -weighted_mean(items) / math.log(2)
+
+
+ def bleu(items):
+     """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
+     for evaluating a generated sentence to a reference sentence. It counts matching
+     n-grams in the candidate translation to n-grams in the reference text, where
+     1-gram or unigram would be each token and a bigram comparison would be each
+     word pair. The comparison is made regardless of word order
+     Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
+     Paper: https://www.aclweb.org/anthology/P02-1040/
+
+     Higher is better
+     """
+     refs = list(zip(*items))[0]
+     preds = list(zip(*items))[1]
+     refs, preds = _sacreformat(refs, preds)
+     return sacrebleu.corpus_bleu(preds, refs).score
+
+ def bleu_ngram_one_sample(predict, reference):
+     """
+     Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores
+
+     Args:
+         items: [(ref, pred)]
+
+     Returns:
+         {
+             'bleu-1': 0.8,
+             'bleu-2': 0.45,
+             'bleu-3': 0.0,
+             'bleu-4': 0.0
+         }
+
+     """
+     def is_contains_chinese(strs):
+         for _char in strs:
+             if '\u4e00' <= _char <= '\u9fa5':
+                 return True
+         return False
+
+     predict = list(jieba.cut(predict)) if is_contains_chinese(predict) else word_tokenize(predict)
+     reference = [list(jieba.cut(reference))] if is_contains_chinese(reference) else [word_tokenize(reference)]
+
+     result = dict()
+     result['bleu-1'] = sentence_bleu(reference, predict, weights=(1, 0, 0, 0))
+     result['bleu-2'] = sentence_bleu(reference, predict, weights=(0, 1, 0, 0))
+     result['bleu-3'] = sentence_bleu(reference, predict, weights=(0, 0, 1, 0))
+     result['bleu-4'] = sentence_bleu(reference, predict, weights=(0, 0, 0, 1))
+
+     return result
+
+
+ def chrf(items):
+     """chrF++ is a tool for automatic evaluation of machine translation output
+     based on character n-gram precision and recall enhanced with word n-grams.
+     Source: https://github.com/m-popovic/chrF
+     Paper: https://www.aclweb.org/anthology/W15-3049.pdf
+
+     Higher is better  # TODO I think
+     """
+     refs = list(zip(*items))[0]
+     preds = list(zip(*items))[1]
+     refs, preds = _sacreformat(refs, preds)
+     return sacrebleu.corpus_chrf(preds, refs).score
+
+
+ def ter(items):
+     """Translation Error Rate is an error metric for machine translation that
+     measures the number of edits required to change a system output into one
+     of the references
+     Source: http://www.cs.umd.edu/~snover/tercom/
+     Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
+
+     Lower is better
+     """
+     refs = list(zip(*items))[0]
+     preds = list(zip(*items))[1]
+     refs, preds = _sacreformat(refs, preds)
+     return sacrebleu.corpus_ter(preds, refs).score
+
+
+ def is_non_str_iterable(obj):
+     return isinstance(obj, Iterable) and not isinstance(obj, str)
+
+
+ def _sacreformat(refs, preds):
+     """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
+     # Sacrebleu expects (List[str], List[List[str])
+     # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
+
+     # Note [ref1_stream] is the first reference for each pred.
+     # So lists are size N and (M, N) for N preds and M possible refs for each pred
+     # This is a different order of dimensions that I would expect
+
+     # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
+     # Must become List[List[str]] with the inner list corresponding to preds
+     if not is_non_str_iterable(refs):
+         refs = list(refs)
+     if not is_non_str_iterable(refs[0]):
+         refs = [[ref] for ref in refs]
+     refs = list(zip(*refs))
+     # Note the number of refs in each ref list much match the number of preds
+
+     # We expect preds to be List[str] or List[List[str]]. Must become List[str]
+     if not is_non_str_iterable(preds):
+         preds = list(preds)
+     if is_non_str_iterable(preds[0]):
+         assert len(preds[0]) == 1, f'Pred must be a str, was {preds[0]}'
+         preds = [pred[0] for pred in preds]
+
+     return refs, preds
+
+
+ class _bootstrap_internal:
+     def __init__(self, f, n):
+         self.f = f
+         self.n = n
+
+     def __call__(self, v):
+         i, xs = v
+         rnd = random.Random()
+         rnd.seed(i)
+         res = []
+         for _ in range(self.n):
+             res.append(self.f(rnd.choices(xs, k=len(xs))))
+         return res
+
+
+ def bootstrap_stderr(f, xs, iters):
+     import multiprocessing as mp
+
+     pool = mp.Pool(mp.cpu_count())
+     # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
+     # equivalent to stderr calculated without Bessel's correction in the stddev.
+     # Unfortunately, I haven't been able to figure out what the right correction is
+     # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
+     # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
+     # Thankfully, shouldn't matter because our samples are pretty big usually anyways
+     res = []
+     chunk_size = min(1000, iters)
+     from tqdm import tqdm
+
+     print('bootstrapping for stddev:', f.__name__)
+     for bootstrap in tqdm(
+         pool.imap(
+             _bootstrap_internal(f, chunk_size),
+             [(i, xs) for i in range(iters // chunk_size)],
+         ),
+         total=iters // chunk_size,
+     ):
+         # sample w replacement
+         res.extend(bootstrap)
+
+     pool.close()
+     return sample_stddev(res)
+
+
+ def stderr_for_metric(metric, bootstrap_iters):
+     bootstrappable = [
+         median,
+         matthews_corrcoef,
+         f1_score,
+         perplexity,
+         bleu,
+         chrf,
+         ter,
+     ]
+
+     if metric in bootstrappable:
+         return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)
+
+     stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
+
+     return stderr.get(metric, None)
+
+
+ def yesno(x):
+     if x:
+         return 'yes'
+     else:
+         return 'no'
+
+
+ def compute_elo(battles,
+                 col_model_a='model_a',
+                 col_model_b='model_b',
+                 col_win='win',
+                 tie_values=['tie', 'tie (bothbad)'],
+                 k=32,
+                 scale=400,
+                 base=10,
+                 init_rating=1000):
+     rating = defaultdict(lambda: init_rating)
+
+     for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
+         ra = rating[model_a]
+         rb = rating[model_b]
+         ea = 1 / (1 + base**((rb - ra) / scale))
+         eb = 1 / (1 + base**((ra - rb) / scale))
+         if win == col_model_a:
+             sa = 1
+         elif win == col_model_b:
+             sa = 0
+         elif win in tie_values:
+             sa = 0.5
+         else:
+             raise Exception(f'unexpected vote {win}')
+         rating[model_a] += k * (sa - ea)
+         rating[model_b] += k * (1 - sa - eb)
+
+     return rating
+
+
+ def exact_match(gold: str, pred: str) -> float:
+     if not pred:
+         return 0
+
+     return 1 if gold.strip() == pred.strip() else 0
+
+
+ def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[str, List[str]]) -> float:
+     """
+     Calculate accuracy for ARC benchmark.
+
+     Args:
+         question_answers: question_id -> answer mapping, e.g. {'abc_123': 'A'}
+         predictions: question_id -> prediction mapping, e.g. {'abc_123': ['D'], 'xyz_456': ['A', 'C']}
+
+     Returns:
+         accuracy score (float)
+
+     Notes:
+         Each question is worth one point. Models are allowed to give multiple answers (e.g., "A;C"),
+         in which case the model receives 1/N points credit if one of its N answers is correct.
+         Refer to: https://leaderboard.allenai.org/arc/submissions/get-started
+     """
+     score = 0.0
+
+     for question_id, answer in question_answers.items():
+         try:
+             predictions_for_q = predictions[question_id]
+         except Exception as e:
+             raise KeyError(f'Missing arc prediction: {e}')
+
+         if answer in predictions_for_q:
+             score += 1.0 / len(predictions_for_q)
+
+         del predictions[question_id]
+
+     if len(predictions) > 0:
+         log_ex: str = ', '.join(list(predictions.keys())[:3])
+         raise ValueError(f'Found {len(predictions)} extra predictions, for example: {log_ex}')
+
+     return score / len(question_answers)
+
+
+ def calculate_pass_at_k(
+     num_samples: Union[int, List[int], np.ndarray],
+     num_correct: Union[List[int], np.ndarray],
+     k: int = 1
+ ) -> np.ndarray:
+     """
+     Estimates pass@k of each problem and returns them in an array.
+     Examples:
+         >>> import numpy as np
+         >>> from typing import Union
+         >>> total = np.array([5, 5, 5])
+         >>> correct = np.array([2, 4, 2])
+         >>> calculate_pass_at_k(total, correct, 1)
+         result: "array([0.4, 0.8, 0.4])"
+     """
+
+     def estimator(n: int, c: int, k: int) -> float:
+         """
+         Calculates 1 - comb(n - c, k) / comb(n, k).
+         """
+         if n - c < k:
+             return 1.0
+         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+     if isinstance(num_samples, int):
+         num_samples_it = itertools.repeat(num_samples, len(num_correct))
+     else:
+         assert len(num_samples) == len(num_correct)
+         num_samples_it = iter(num_samples)
+
+     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
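
As a reading aid for evalscope/metrics/metrics.py above, here is a minimal usage sketch of calculate_pass_at_k and compute_elo. It assumes numpy and pandas are installed alongside evalscope; the model names in the battles table are placeholders, not models shipped with the package.

import numpy as np
import pandas as pd

from evalscope.metrics.metrics import calculate_pass_at_k, compute_elo

# pass@1 for three problems, each with 5 generated samples.
total = np.array([5, 5, 5])      # samples generated per problem
correct = np.array([2, 4, 2])    # samples that passed the checks
print(calculate_pass_at_k(total, correct, k=1))   # array([0.4, 0.8, 0.4])

# Elo ratings from pairwise battles; the 'win' column must hold 'model_a',
# 'model_b', or one of the tie values ('tie', 'tie (bothbad)').
battles = pd.DataFrame({
    'model_a': ['model-x', 'model-x', 'model-y'],   # placeholder model names
    'model_b': ['model-y', 'model-y', 'model-x'],
    'win':     ['model_a', 'tie', 'model_b'],
})
ratings = compute_elo(battles)   # defaultdict mapping model name -> Elo rating
print(dict(ratings))
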
evalscope/metrics/rouge_metric.py
@@ -0,0 +1,129 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import logging
+ from collections import defaultdict
+ from pathlib import Path
+ from statistics import mean
+
+ from tqdm import tqdm
+
+ from evalscope.constants import MetricsConstant
+ from evalscope.metrics.bundled_rouge_score import rouge_scorer
+ from evalscope.preprocess.tokenizers.gpt2_tokenizer import DummyTokenizer
+ from rouge_chinese import Rouge
+ import jieba
+
+ HERE = Path(__file__).absolute().parent
+
+ logger = logging.getLogger(__name__)
+
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
+                                   tokenizer=DummyTokenizer())
+ zh_scorer = Rouge()
+
+
+ def is_contains_chinese(strs):
+     for _char in strs:
+         if '\u4e00' <= _char <= '\u9fa5':
+             return True
+     return False
+
+ def compute_rouge_score(predict_l, reference_l):
+     assert len(predict_l) == len(reference_l)
+     if len(predict_l) == 0:
+         tmp_d = dict()
+         for key in MetricsConstant.ROUGE_KEYS:
+             tmp_d[key] = 0
+         return tmp_d
+
+     result = defaultdict(list)
+     for p, r in tqdm(zip(predict_l, reference_l)):
+         one_sample = compute_rouge_score_one_sample(p, r)
+         for rouge_key in MetricsConstant.ROUGE_KEYS:
+             result[rouge_key].append(one_sample[rouge_key])
+     rlt = {}
+     for rouge_key in MetricsConstant.ROUGE_KEYS:
+         rlt[rouge_key] = mean(result[rouge_key]) * 100 if rouge_key in result \
+             else MetricsConstant.INVALID_VALUE
+     return rlt
+
+ def compute_rouge_score_one_sample_zh(predict, reference):
+     result = dict()
+     for p, r in zip(predict, reference):
+         p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
+         r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r
+
+         score = zh_scorer.get_scores(p, r)[0]
+         result['rouge-1-r'] = score['rouge-1']['r']
+         result['rouge-1-p'] = score['rouge-1']['p']
+         result['rouge-1-f'] = score['rouge-1']['f']
+         result['rouge-2-r'] = score['rouge-2']['r']
+         result['rouge-2-p'] = score['rouge-2']['p']
+         result['rouge-2-f'] = score['rouge-2']['f']
+         result['rouge-l-r'] = score['rouge-l']['r']
+         result['rouge-l-p'] = score['rouge-l']['p']
+         result['rouge-l-f'] = score['rouge-l']['f']
+
+     return result
+
+ def compute_rouge_score_one_sample(predict, reference):
+     result = dict()
+     for p, r in zip(predict, reference):
+         score = scorer.score(p, r)
+         result['rouge-1-r'] = score['rouge1'].recall
+         result['rouge-1-p'] = score['rouge1'].precision
+         result['rouge-1-f'] = score['rouge1'].fmeasure
+         result['rouge-2-r'] = score['rouge2'].recall
+         result['rouge-2-p'] = score['rouge2'].precision
+         result['rouge-2-f'] = score['rouge2'].fmeasure
+         result['rouge-l-r'] = score['rougeL'].recall
+         result['rouge-l-p'] = score['rougeL'].precision
+         result['rouge-l-f'] = score['rougeL'].fmeasure
+
+     return result
+
+
+ def _to_table(final_result) -> str:
+     table = []
+     # step 1. table header
+     all_tasks = ['', 'total']
+     all_tasks.extend(final_result['all_tasks'].split(','))
+     table.append('\t'.join(all_tasks))
+
+     # step 2. table row
+     for rouge_key in MetricsConstant.ROUGE_KEYS:
+         row = [rouge_key]
+         for task in all_tasks:
+             if not task:
+                 continue
+             elif task == 'total':
+                 row.append(
+                     f'{final_result["total"]["rouge"][rouge_key] :0.2f}')
+             else:
+                 row.append(
+                     f'{final_result["tasks"][task]["rouge"][rouge_key] :0.2f}')
+         table.append('\t'.join(row))
+
+     return '\n'.join(table)
+
+
+ def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
+     print(f"{'#' * md_level} Rouge Eval")
+     for data in tqdm(data_l):
+         data['rouge'] = compute_rouge_score_one_sample(
+             data['gen_tok_str'], data['reference_tok_str'])
+     task_data_d = defaultdict(list)
+     for data in data_l:
+         for task in data['task_tags']:
+             task_data_d[task].append(data)
+
+     total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
+     print(f'[total], count: {len(data_l)}, {report_metric_key}: '
+           f'{total_rouge * 100:0.2f}%')
+
+     for task, task_data in task_data_d.items():
+         task_rouge = mean(
+             [data['rouge'][report_metric_key] for data in task_data])
+         print(
+             f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
+             f'{task_rouge * 100:0.2f}%')
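
A short, hypothetical usage sketch for evalscope/metrics/rouge_metric.py above, assuming evalscope plus its rouge_chinese and jieba dependencies are installed. Both helpers zip their two arguments, so they expect parallel lists of predictions and references; the single-element lists below keep that unambiguous.

from evalscope.metrics.rouge_metric import (
    compute_rouge_score_one_sample,
    compute_rouge_score_one_sample_zh,
)

# English text goes through the bundled rouge_scorer (rouge1/rouge2/rougeL).
en_scores = compute_rouge_score_one_sample(
    ['the cat sat on the mat'],    # predictions
    ['the cat sat on a mat'],      # references
)
print(en_scores['rouge-l-f'])

# Chinese text is detected, segmented with jieba, and scored with rouge_chinese.
zh_scores = compute_rouge_score_one_sample_zh(['今天天气很好'], ['今天的天气非常好'])
print(zh_scores['rouge-1-f'])
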
evalscope/models/__init__.py
@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.models.model import BaseModel
+ from evalscope.models.model import ChatBaseModel
evalscope/models/custom/__init__.py
@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.models.custom.custom_model import *
+
evalscope/models/custom/custom_model.py
@@ -0,0 +1,53 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from abc import ABC, abstractmethod
+ from typing import Any, Union, Dict, List
+ import torch
+
+
+ class CustomModel(ABC):
+
+     def __init__(self, config: dict, **kwargs):
+         self.config = config
+         self.kwargs = kwargs
+
+         if config.get('model_id', None) is None:
+             raise ValueError(f"**Error: model_id is required in config for CustomModel. Got config: {config}")
+
+     @abstractmethod
+     @torch.no_grad()
+     def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
+         """
+         Model prediction function for batch inputs.
+
+         Args:
+             prompts (str): The input batch of prompts to predict.
+
+             **kwargs: kwargs
+
+         Returns:
+             res (dict): The model prediction results (batch). Format:
+             [
+                 {
+                     'choices': [
+                         {
+                             'index': 0,
+                             'message': {
+                                 'content': 'xxx',
+                                 'role': 'assistant'
+                             }
+                         }
+                     ],
+                     'created': 1677664795,
+                     'model': 'gpt-3.5-turbo-0613',  # should be model_id
+                     'object': 'chat.completion',
+                     'usage': {
+                         'completion_tokens': 17,
+                         'prompt_tokens': 57,
+                         'total_tokens': 74
+                     }
+                 }
+                 ,
+                 ...
+             ]
+         """
+         raise NotImplementedError
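
To illustrate the contract documented in CustomModel.predict above, here is a minimal, hypothetical subclass; EchoModel and the 'my-echo-model' id are made up for this sketch, and it assumes evalscope and torch are installed.

import time
from typing import Any, Dict, List

from evalscope.models.custom.custom_model import CustomModel


class EchoModel(CustomModel):
    """Toy custom model that echoes each prompt back in the expected response format."""

    def __init__(self, **kwargs):
        # The base class rejects configs without a 'model_id'.
        super().__init__(config={'model_id': 'my-echo-model'}, **kwargs)

    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        # One OpenAI-style chat.completion dict per input prompt.
        return [{
            'choices': [{
                'index': 0,
                'message': {'content': p, 'role': 'assistant'},
            }],
            'created': int(time.time()),
            'model': self.config['model_id'],
            'object': 'chat.completion',
            'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0},
        } for p in prompts]
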
evalscope/models/dummy_chat_model.py
@@ -0,0 +1,50 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import random
+ import time
+ from evalscope.models import ChatBaseModel
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class DummyChatModel(ChatBaseModel):
+
+     MODEL_ID = 'dummy_chat_model_0801'
+     REVISION = 'v1.0.0'
+
+     def __init__(self, model_cfg: dict, **kwargs):
+         model_cfg['model_id'] = self.MODEL_ID
+         model_cfg['revision'] = self.REVISION
+         super(DummyChatModel, self).__init__(model_cfg=model_cfg)
+
+     def predict(self, inputs: dict, **kwargs) -> dict:
+
+         debug: bool = False
+         if debug:
+             messages = inputs['messages']
+             history = inputs['history']
+
+             logger.info(f'** messages: {messages}')
+             logger.info(f'** history: {history}')
+
+         choice = random.choice(['A', 'B', 'C', 'D'])
+
+         # Build response
+         res = {
+             'choices': [
+                 {
+                     'index': 0,
+                     'message': {
+                         'content': choice,
+                         'role': 'assistant'
+                     }
+                 }
+             ],
+             'created': time.time(),
+             'model': self.MODEL_ID + '-' + self.REVISION,
+             'object': 'chat.completion',
+             'usage': {}
+         }
+
+         return res
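
Finally, a minimal sketch of exercising DummyChatModel above. It assumes ChatBaseModel accepts this bare model_cfg without further required keys; the 'messages' and 'history' entries are only read when the internal debug flag is flipped.

from evalscope.models.dummy_chat_model import DummyChatModel

model = DummyChatModel(model_cfg={})   # model_id/revision are filled in by __init__
res = model.predict(inputs={'messages': [], 'history': []})

print(res['model'])                             # 'dummy_chat_model_0801-v1.0.0'
print(res['choices'][0]['message']['content'])  # one of 'A', 'B', 'C', 'D'
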