evalscope-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0

evalscope/metrics/metrics.py
@@ -0,0 +1,405 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) EleutherAI. and its affiliates.
# Copyright (c) OpenAI. and its affiliates.
import itertools
import math
from collections.abc import Iterable
from collections import defaultdict
from typing import Dict, List, Union
from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize
import jieba

import numpy as np
import sacrebleu
import sklearn.metrics
import random


def mean(arr):
    return sum(arr) / len(arr)


def pop_stddev(arr):
    mu = mean(arr)
    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))


def sample_stddev(arr):
    mu = mean(arr)
    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))


def mean_stderr(arr):
    return sample_stddev(arr) / math.sqrt(len(arr))


def median(arr):
    return arr[len(arr) // 2]


def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)


def f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds)

    return np.max(fscore)


def acc_all(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]

    for doc, pred in zip(docs, preds):
        paragraph_id = doc['idx']['paragraph']
        question_id = doc['idx']['question']
        if (paragraph_id, question_id) not in question_scoring_dict:
            question_scoring_dict[(paragraph_id, question_id)] = []

        gold_label = doc['label'] == 1

        question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred)
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc


def acc_all_stderr(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]

    for doc, pred in zip(docs, preds):
        question_id = doc['idx']['question']
        if question_id not in question_scoring_dict:
            question_scoring_dict[question_id] = []

        gold_label = doc['label'] == 1
        question_scoring_dict[question_id].append(gold_label == pred)

    acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
    return acc


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Compute max metric between prediction and each ground truth."""
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def perplexity(items):
    return math.exp(-mean(items))


def weighted_mean(items) -> float:
    # e.g. [(0,1), (0.5,1), (1,1)]
    a, b = zip(*items)
    return sum(a) / sum(b)


def weighted_perplexity(items):
    return math.exp(-weighted_mean(items))


def bits_per_byte(items):
    return -weighted_mean(items) / math.log(2)


def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
    for evaluating a generated sentence to a reference sentence. It counts matching
    n-grams in the candidate translation to n-grams in the reference text, where
    1-gram or unigram would be each token and a bigram comparison would be each
    word pair. The comparison is made regardless of word order
    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/

    Higher is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_bleu(preds, refs).score

def bleu_ngram_one_sample(predict, reference):
    """
    Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores

    Args:
        items: [(ref, pred)]

    Returns:
        {
            'bleu-1': 0.8,
            'bleu-2': 0.45,
            'bleu-3': 0.0,
            'bleu-4': 0.0
        }

    """
    def is_contains_chinese(strs):
        for _char in strs:
            if '\u4e00' <= _char <= '\u9fa5':
                return True
        return False

    predict = list(jieba.cut(predict)) if is_contains_chinese(predict) else word_tokenize(predict)
    reference = [list(jieba.cut(reference))] if is_contains_chinese(reference) else [word_tokenize(reference)]

    result = dict()
    result['bleu-1'] = sentence_bleu(reference, predict, weights=(1, 0, 0, 0))
    result['bleu-2'] = sentence_bleu(reference, predict, weights=(0, 1, 0, 0))
    result['bleu-3'] = sentence_bleu(reference, predict, weights=(0, 0, 1, 0))
    result['bleu-4'] = sentence_bleu(reference, predict, weights=(0, 0, 0, 1))

    return result


def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better  # TODO I think
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_chrf(preds, refs).score


def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_ter(preds, refs).score


def is_non_str_iterable(obj):
    return isinstance(obj, Iterable) and not isinstance(obj, str)


def _sacreformat(refs, preds):
    """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
    # Sacrebleu expects (List[str], List[List[str]])
    # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])

    # Note [ref1_stream] is the first reference for each pred.
    # So lists are size N and (M, N) for N preds and M possible refs for each pred
    # This is a different order of dimensions than I would expect

    # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
    # Must become List[List[str]] with the inner list corresponding to preds
    if not is_non_str_iterable(refs):
        refs = list(refs)
    if not is_non_str_iterable(refs[0]):
        refs = [[ref] for ref in refs]
    refs = list(zip(*refs))
    # Note the number of refs in each ref list must match the number of preds

    # We expect preds to be List[str] or List[List[str]]. Must become List[str]
    if not is_non_str_iterable(preds):
        preds = list(preds)
    if is_non_str_iterable(preds[0]):
        assert len(preds[0]) == 1, f'Pred must be a str, was {preds[0]}'
        preds = [pred[0] for pred in preds]

    return refs, preds


class _bootstrap_internal:
    def __init__(self, f, n):
        self.f = f
        self.n = n

    def __call__(self, v):
        i, xs = v
        rnd = random.Random()
        rnd.seed(i)
        res = []
        for _ in range(self.n):
            res.append(self.f(rnd.choices(xs, k=len(xs))))
        return res


def bootstrap_stderr(f, xs, iters):
    import multiprocessing as mp

    pool = mp.Pool(mp.cpu_count())
    # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
    # equivalent to stderr calculated without Bessel's correction in the stddev.
    # Unfortunately, I haven't been able to figure out what the right correction is
    # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
    # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
    # Thankfully, shouldn't matter because our samples are pretty big usually anyways
    res = []
    chunk_size = min(1000, iters)
    from tqdm import tqdm

    print('bootstrapping for stddev:', f.__name__)
    for bootstrap in tqdm(
        pool.imap(
            _bootstrap_internal(f, chunk_size),
            [(i, xs) for i in range(iters // chunk_size)],
        ),
        total=iters // chunk_size,
    ):
        # sample w replacement
        res.extend(bootstrap)

    pool.close()
    return sample_stddev(res)


def stderr_for_metric(metric, bootstrap_iters):
    bootstrappable = [
        median,
        matthews_corrcoef,
        f1_score,
        perplexity,
        bleu,
        chrf,
        ter,
    ]

    if metric in bootstrappable:
        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)

    stderr = {mean: mean_stderr, acc_all: acc_all_stderr}

    return stderr.get(metric, None)


def yesno(x):
    if x:
        return 'yes'
    else:
        return 'no'


def compute_elo(battles,
                col_model_a='model_a',
                col_model_b='model_b',
                col_win='win',
                tie_values=['tie', 'tie (bothbad)'],
                k=32,
                scale=400,
                base=10,
                init_rating=1000):
    rating = defaultdict(lambda: init_rating)

    for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + base**((rb - ra) / scale))
        eb = 1 / (1 + base**((ra - rb) / scale))
        if win == col_model_a:
            sa = 1
        elif win == col_model_b:
            sa = 0
        elif win in tie_values:
            sa = 0.5
        else:
            raise Exception(f'unexpected vote {win}')
        rating[model_a] += k * (sa - ea)
        rating[model_b] += k * (1 - sa - eb)

    return rating


def exact_match(gold: str, pred: str) -> float:
    if not pred:
        return 0

    return 1 if gold.strip() == pred.strip() else 0


def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[str, List[str]]) -> float:
    """
    Calculate accuracy for ARC benchmark.

    Args:
        question_answers: question_id -> answer mapping, e.g. {'abc_123': 'A'}
        predictions: question_id -> prediction mapping, e.g. {'abc_123': ['D'], 'xyz_456': ['A', 'C']}

    Returns:
        accuracy score (float)

    Notes:
        Each question is worth one point. Models are allowed to give multiple answers (e.g., "A;C"),
        in which case the model receives 1/N points credit if one of its N answers is correct.
        Refer to: https://leaderboard.allenai.org/arc/submissions/get-started
    """
    score = 0.0

    for question_id, answer in question_answers.items():
        try:
            predictions_for_q = predictions[question_id]
        except Exception as e:
            raise KeyError(f'Missing arc prediction: {e}')

        if answer in predictions_for_q:
            score += 1.0 / len(predictions_for_q)

        del predictions[question_id]

    if len(predictions) > 0:
        log_ex: str = ', '.join(list(predictions.keys())[:3])
        raise ValueError(f'Found {len(predictions)} extra predictions, for example: {log_ex}')

    return score / len(question_answers)


def calculate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int = 1
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    Examples:
        >>> import numpy as np
        >>> from typing import Union
        >>> total = np.array([5, 5, 5])
        >>> correct = np.array([2, 4, 2])
        >>> calculate_pass_at_k(total, correct, 1)
        result: "array([0.4, 0.8, 0.4])"
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
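
For orientation, here is a short usage sketch (not part of the package diff) exercising a few of the helpers defined in metrics.py above. It assumes the functions are importable from evalscope.metrics.metrics and that pandas is installed, since compute_elo iterates over a DataFrame of battles; the model names and battle rows below are invented for illustration.

```python
import numpy as np
import pandas as pd

from evalscope.metrics.metrics import calculate_pass_at_k, compute_elo, mean, stderr_for_metric

# pass@1 per problem: 5 samples generated for each of three problems,
# of which 2, 4 and 2 passed their unit tests.
total = np.array([5, 5, 5])
correct = np.array([2, 4, 2])
print(calculate_pass_at_k(total, correct, k=1))   # array([0.4, 0.8, 0.4])

# Elo ratings from pairwise battles. compute_elo expects a DataFrame-like
# object; the 'win' column holds 'model_a', 'model_b', or a tie value.
battles = pd.DataFrame([
    {'model_a': 'qwen-7b-chat', 'model_b': 'baseline', 'win': 'model_a'},
    {'model_a': 'qwen-7b-chat', 'model_b': 'baseline', 'win': 'tie'},
    {'model_a': 'baseline', 'model_b': 'qwen-7b-chat', 'win': 'model_b'},
])
print(dict(compute_elo(battles)))

# stderr_for_metric returns a bootstrap closure only for metrics listed in
# `bootstrappable`; mean and acc_all map to closed-form stderr helpers.
stderr_fn = stderr_for_metric(mean, bootstrap_iters=1000)
print(stderr_fn([0.0, 1.0, 1.0, 0.0, 1.0]))
```

Note that the bootstrap path spawns a multiprocessing pool sized to cpu_count, so for quick checks the closed-form branch (mean, acc_all) is the cheaper one to exercise.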

evalscope/metrics/rouge_metric.py
@@ -0,0 +1,129 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import logging
from collections import defaultdict
from pathlib import Path
from statistics import mean

from tqdm import tqdm

from evalscope.constants import MetricsConstant
from evalscope.metrics.bundled_rouge_score import rouge_scorer
from evalscope.preprocess.tokenizers.gpt2_tokenizer import DummyTokenizer
from rouge_chinese import Rouge
import jieba

HERE = Path(__file__).absolute().parent

logger = logging.getLogger(__name__)

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                  tokenizer=DummyTokenizer())
zh_scorer = Rouge()


def is_contains_chinese(strs):
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False

def compute_rouge_score(predict_l, reference_l):
    assert len(predict_l) == len(reference_l)
    if len(predict_l) == 0:
        tmp_d = dict()
        for key in MetricsConstant.ROUGE_KEYS:
            tmp_d[key] = 0
        return tmp_d

    result = defaultdict(list)
    for p, r in tqdm(zip(predict_l, reference_l)):
        one_sample = compute_rouge_score_one_sample(p, r)
        for rouge_key in MetricsConstant.ROUGE_KEYS:
            result[rouge_key].append(one_sample[rouge_key])
    rlt = {}
    for rouge_key in MetricsConstant.ROUGE_KEYS:
        rlt[rouge_key] = mean(result[rouge_key]) * 100 if rouge_key in result \
            else MetricsConstant.INVALID_VALUE
    return rlt

def compute_rouge_score_one_sample_zh(predict, reference):
    result = dict()
    for p, r in zip(predict, reference):
        p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
        r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r

        score = zh_scorer.get_scores(p, r)[0]
        result['rouge-1-r'] = score['rouge-1']['r']
        result['rouge-1-p'] = score['rouge-1']['p']
        result['rouge-1-f'] = score['rouge-1']['f']
        result['rouge-2-r'] = score['rouge-2']['r']
        result['rouge-2-p'] = score['rouge-2']['p']
        result['rouge-2-f'] = score['rouge-2']['f']
        result['rouge-l-r'] = score['rouge-l']['r']
        result['rouge-l-p'] = score['rouge-l']['p']
        result['rouge-l-f'] = score['rouge-l']['f']

    return result

def compute_rouge_score_one_sample(predict, reference):
    result = dict()
    for p, r in zip(predict, reference):
        score = scorer.score(p, r)
        result['rouge-1-r'] = score['rouge1'].recall
        result['rouge-1-p'] = score['rouge1'].precision
        result['rouge-1-f'] = score['rouge1'].fmeasure
        result['rouge-2-r'] = score['rouge2'].recall
        result['rouge-2-p'] = score['rouge2'].precision
        result['rouge-2-f'] = score['rouge2'].fmeasure
        result['rouge-l-r'] = score['rougeL'].recall
        result['rouge-l-p'] = score['rougeL'].precision
        result['rouge-l-f'] = score['rougeL'].fmeasure

    return result


def _to_table(final_result) -> str:
    table = []
    # step 1. table header
    all_tasks = ['', 'total']
    all_tasks.extend(final_result['all_tasks'].split(','))
    table.append('\t'.join(all_tasks))

    # step 2. table row
    for rouge_key in MetricsConstant.ROUGE_KEYS:
        row = [rouge_key]
        for task in all_tasks:
            if not task:
                continue
            elif task == 'total':
                row.append(
                    f'{final_result["total"]["rouge"][rouge_key] :0.2f}')
            else:
                row.append(
                    f'{final_result["tasks"][task]["rouge"][rouge_key] :0.2f}')
        table.append('\t'.join(row))

    return '\n'.join(table)


def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
    print(f"{'#' * md_level} Rouge Eval")
    for data in tqdm(data_l):
        data['rouge'] = compute_rouge_score_one_sample(
            data['gen_tok_str'], data['reference_tok_str'])
    task_data_d = defaultdict(list)
    for data in data_l:
        for task in data['task_tags']:
            task_data_d[task].append(data)

    total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
    print(f'[total], count: {len(data_l)}, {report_metric_key}: '
          f'{total_rouge * 100:0.2f}%')

    for task, task_data in task_data_d.items():
        task_rouge = mean(
            [data['rouge'][report_metric_key] for data in task_data])
        print(
            f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
            f'{task_rouge * 100:0.2f}%')
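
A minimal sketch (not part of the package diff) of scoring a single prediction/reference pair with the helpers above. It assumes the module is importable as evalscope.metrics.rouge_metric and that rouge_chinese, jieba and the bundled rouge_scorer dependencies are installed; the exact English scores depend on the bundled DummyTokenizer.

```python
from evalscope.metrics.rouge_metric import (
    compute_rouge_score_one_sample,
    compute_rouge_score_one_sample_zh,
)

# Both helpers zip over their arguments, so predictions and references are
# passed as parallel lists of strings (here, a single pair each).
en_scores = compute_rouge_score_one_sample(
    ['the cat sat on the mat'],
    ['the cat is on the mat'],
)
print(en_scores['rouge-1-f'], en_scores['rouge-l-f'])

# The *_zh variant segments Chinese text with jieba and scores it with
# rouge_chinese instead of the bundled English scorer.
zh_scores = compute_rouge_score_one_sample_zh(['今天天气很好'], ['今天天气不错'])
print(zh_scores['rouge-l-f'])
```

run_rouge_eval builds on the same per-sample scores, grouping records by their task_tags and printing a per-task report for the chosen metric key.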

evalscope/models/custom/custom_model.py
@@ -0,0 +1,53 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from abc import ABC, abstractmethod
from typing import Any, Union, Dict, List
import torch


class CustomModel(ABC):

    def __init__(self, config: dict, **kwargs):
        self.config = config
        self.kwargs = kwargs

        if config.get('model_id', None) is None:
            raise ValueError(f"**Error: model_id is required in config for CustomModel. Got config: {config}")

    @abstractmethod
    @torch.no_grad()
    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        """
        Model prediction function for batch inputs.

        Args:
            prompts (str): The input batch of prompts to predict.

            **kwargs: kwargs

        Returns:
            res (dict): The model prediction results (batch). Format:
            [
                {
                    'choices': [
                        {
                            'index': 0,
                            'message': {
                                'content': 'xxx',
                                'role': 'assistant'
                            }
                        }
                    ],
                    'created': 1677664795,
                    'model': 'gpt-3.5-turbo-0613',   # should be model_id
                    'object': 'chat.completion',
                    'usage': {
                        'completion_tokens': 17,
                        'prompt_tokens': 57,
                        'total_tokens': 74
                    }
                }
                ,
                ...
            ]
        """
        raise NotImplementedError
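
A minimal sketch (not part of the package diff) of subclassing CustomModel: a hypothetical echo model that wraps each prompt in the OpenAI-style chat.completion payload described by the predict docstring above. It imports from the module path shown in this diff and needs torch installed, since custom_model.py imports torch at module level; the class and model_id below are invented for illustration.

```python
import time
from typing import Any, Dict, List

from evalscope.models.custom.custom_model import CustomModel


class EchoModel(CustomModel):
    """Hypothetical example: echoes each prompt back as the assistant reply."""

    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        responses = []
        for prompt in prompts:
            responses.append({
                'choices': [{
                    'index': 0,
                    'message': {'content': f'echo: {prompt}', 'role': 'assistant'},
                }],
                'created': int(time.time()),
                'model': self.config['model_id'],
                'object': 'chat.completion',
                'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0},
            })
        return responses


# 'model_id' is mandatory: CustomModel.__init__ raises ValueError without it.
model = EchoModel(config={'model_id': 'echo-model-v1'})
print(model.predict(['Hello there'])[0]['choices'][0]['message']['content'])
```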

evalscope/models/dummy_chat_model.py
@@ -0,0 +1,50 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import random
import time
from evalscope.models import ChatBaseModel
from evalscope.utils.logger import get_logger

logger = get_logger()


class DummyChatModel(ChatBaseModel):

    MODEL_ID = 'dummy_chat_model_0801'
    REVISION = 'v1.0.0'

    def __init__(self, model_cfg: dict, **kwargs):
        model_cfg['model_id'] = self.MODEL_ID
        model_cfg['revision'] = self.REVISION
        super(DummyChatModel, self).__init__(model_cfg=model_cfg)

    def predict(self, inputs: dict, **kwargs) -> dict:

        debug: bool = False
        if debug:
            messages = inputs['messages']
            history = inputs['history']

            logger.info(f'** messages: {messages}')
            logger.info(f'** history: {history}')

        choice = random.choice(['A', 'B', 'C', 'D'])

        # Build response
        res = {
            'choices': [
                {
                    'index': 0,
                    'message': {
                        'content': choice,
                        'role': 'assistant'
                    }
                }
            ],
            'created': time.time(),
            'model': self.MODEL_ID + '-' + self.REVISION,
            'object': 'chat.completion',
            'usage': {}
        }

        return res
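
Finally, a small sketch (not part of the package diff) of driving the dummy model above, which is handy for smoke-testing an evaluation pipeline without loading a real checkpoint. It assumes the ChatBaseModel base class (imported from evalscope.models) tolerates a minimal model_cfg; DummyChatModel fills in model_id and revision itself, and only reads 'messages'/'history' when its internal debug flag is enabled.

```python
from evalscope.models.dummy_chat_model import DummyChatModel

# Assumption: ChatBaseModel accepts an otherwise-empty config dict.
model = DummyChatModel(model_cfg={})

response = model.predict(inputs={
    'messages': [{'role': 'user', 'content': 'Pick A, B, C or D.'}],
    'history': [],
})

# The reply is a random single letter wrapped in an OpenAI-style payload.
print(response['choices'][0]['message']['content'])   # e.g. 'C'
print(response['model'])                               # dummy_chat_model_0801-v1.0.0
```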