evalscope-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/tools/combine_reports.py
@@ -0,0 +1,140 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import json
+import glob
+from tabulate import tabulate
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+"""
+Combine and generate table for reports of LLMs.
+"""
+
+
+def get_report(report_file: str):
+    data_d: dict = json.load(open(report_file, 'r'))
+    dataset_name = data_d['name']
+    score = data_d['score']  # float or dict
+    score_d = {}
+    if isinstance(score, dict):
+        # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
+        score_d = score
+    elif isinstance(score, float):
+        # score_d['acc'] = round(score, 4) * 100
+        score_d['acc'] = score
+    else:
+        raise ValueError(f'Unknown score type: {type(score)}')
+    # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
+    score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
+
+    return {'dataset_name': dataset_name, 'score': score_str}
+
+
+def get_model_reports(model_report_dir: str):
+    model_report_dir = os.path.normpath(model_report_dir)
+    model_report_dir = model_report_dir.rstrip('reports')
+    model_info = os.path.basename(os.path.normpath(model_report_dir))
+    model_name = '_'.join(model_info.split('_')[:-1][3:])
+    report_files = glob.glob(os.path.join(model_report_dir, 'reports', '*.json'))
+
+    model_reports_d = {model_name: []}
+    for file_path in report_files:
+        report_d = get_report(file_path)
+        model_reports_d[model_name].append(report_d)
+
+    return model_reports_d
+
+
+def gen_table(reports_path_list: list):
+    table_values = []
+    headers = ['Model']
+    is_headers_set = False
+
+    for report_path in reports_path_list:
+        model_reports_d = get_model_reports(report_path)
+        for model_name, report_list in model_reports_d.items():
+            # report_list: [{'dataset_name': 'CompetitionMath', 'score': '4.42 (acc)'},
+            #               {'dataset_name': 'GSM8K', 'score': '28.51 (acc)'}]
+            report_list = sorted(report_list, key=lambda x: x['dataset_name'])
+            if not is_headers_set:
+                headers.extend([x['dataset_name'] for x in report_list])
+                is_headers_set = True
+            single_row = []
+            single_row.append(model_name)
+            for single_report in report_list:
+                # e.g. '28.51 (acc)'
+                single_row.append(single_report['score'])
+            table_values.append(single_row)
+
+    report_table = tabulate(table_values, headers=headers, tablefmt='grid')
+    return report_table
+
+class ReportsRecorder:
+    COMMON_DATASET_PATH = []
+    CUSTOM_DATASET_PATH = []
+
+    def __init__(self, oss_url: str = "", endpoint: str = ""):
+        if oss_url and endpoint:
+            import oss2
+            from oss2.credentials import EnvironmentVariableCredentialsProvider
+
+            auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
+            oss_url = oss_url.replace("oss://", "").split('/')
+            bucket_name = oss_url[0]
+
+            self.object_path = "/".join(oss_url[1:])
+            self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
+        else:
+            self.object_path = ""
+            self.bucket = None
+
+
+    def append_path(self, report_path: str, dataset_name: str):
+        if dataset_name == "general_qa":
+            self.CUSTOM_DATASET_PATH.append(report_path)
+        else:
+            self.COMMON_DATASET_PATH.append(report_path)
+
+    def dump_reports(self, output_dir: str):
+        result = {
+            "CommonDataset": [],
+            "CustomDataset": []
+        }
+        for line in self.COMMON_DATASET_PATH:
+            with open(line, 'r') as f:
+                report = json.load(f)
+                result['CommonDataset'].append(report)
+        for line in self.CUSTOM_DATASET_PATH:
+            with open(line, 'r') as f:
+                report = json.load(f)
+                report.update({"name": os.path.basename(line)})
+                result['CustomDataset'].append(report)
+
+        os.makedirs(output_dir, exist_ok=True)
+        output_file_name = "metric.json"
+        output_path = os.path.join(output_dir, output_file_name)
+        with open(output_path, 'w+') as f:
+            f.write(json.dumps(result, ensure_ascii=False, indent=4))
+
+        if self.bucket:
+            remote_path = os.path.join(self.object_path, output_file_name)
+            logger.info(f"** Upload report to oss: {remote_path}")
+            self.bucket.put_object_from_file(remote_path, output_path)
+
+if __name__ == '__main__':
+    report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
+    report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
+
+    report_table = gen_table([report_dir_1, report_dir_2])
+    print(report_table)
+
+    # ALL VALUES ONLY FOR EXAMPLE
+    # +--------------------------+-------------------+-------------+
+    # | Model                    | CompetitionMath   | GSM8K       |
+    # +==========================+===================+=============+
+    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
+    # +--------------------------+-------------------+-------------+
+    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
+    # +--------------------------+-------------------+-------------+
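For orientation, a minimal usage sketch (not shipped in the wheel) of the report layout that get_report() and gen_table() above consume; the run directory name, report file name, and score value are invented for illustration:

import json
import os
import tempfile

from evalscope.tools.combine_reports import gen_table

# Fabricated report file: get_report() only reads 'name' and a float/dict 'score'.
work_dir = tempfile.mkdtemp()
run_dir = os.path.join(work_dir, '20231129_020533_default_ZhipuAI_chatglm2-6b_none')
report_dir = os.path.join(run_dir, 'reports')
os.makedirs(report_dir)
with open(os.path.join(report_dir, 'gsm8k.json'), 'w') as f:
    json.dump({'name': 'GSM8K', 'score': 28.51}, f)

# gen_table() derives the model name from the run directory and renders a grid table.
print(gen_table([report_dir]))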
evalscope/tools/gen_mmlu_subject_mapping.py
@@ -0,0 +1,90 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+# Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
+
+subcategories = {
+    'abstract_algebra': ['math'],
+    'anatomy': ['health'],
+    'astronomy': ['physics'],
+    'business_ethics': ['business'],
+    'clinical_knowledge': ['health'],
+    'college_biology': ['biology'],
+    'college_chemistry': ['chemistry'],
+    'college_computer_science': ['computer science'],
+    'college_mathematics': ['math'],
+    'college_medicine': ['health'],
+    'college_physics': ['physics'],
+    'computer_security': ['computer science'],
+    'conceptual_physics': ['physics'],
+    'econometrics': ['economics'],
+    'electrical_engineering': ['engineering'],
+    'elementary_mathematics': ['math'],
+    'formal_logic': ['philosophy'],
+    'global_facts': ['other'],
+    'high_school_biology': ['biology'],
+    'high_school_chemistry': ['chemistry'],
+    'high_school_computer_science': ['computer science'],
+    'high_school_european_history': ['history'],
+    'high_school_geography': ['geography'],
+    'high_school_government_and_politics': ['politics'],
+    'high_school_macroeconomics': ['economics'],
+    'high_school_mathematics': ['math'],
+    'high_school_microeconomics': ['economics'],
+    'high_school_physics': ['physics'],
+    'high_school_psychology': ['psychology'],
+    'high_school_statistics': ['math'],
+    'high_school_us_history': ['history'],
+    'high_school_world_history': ['history'],
+    'human_aging': ['health'],
+    'human_sexuality': ['culture'],
+    'international_law': ['law'],
+    'jurisprudence': ['law'],
+    'logical_fallacies': ['philosophy'],
+    'machine_learning': ['computer science'],
+    'management': ['business'],
+    'marketing': ['business'],
+    'medical_genetics': ['health'],
+    'miscellaneous': ['other'],
+    'moral_disputes': ['philosophy'],
+    'moral_scenarios': ['philosophy'],
+    'nutrition': ['health'],
+    'philosophy': ['philosophy'],
+    'prehistory': ['history'],
+    'professional_accounting': ['other'],
+    'professional_law': ['law'],
+    'professional_medicine': ['health'],
+    'professional_psychology': ['psychology'],
+    'public_relations': ['politics'],
+    'security_studies': ['politics'],
+    'sociology': ['culture'],
+    'us_foreign_policy': ['politics'],
+    'virology': ['health'],
+    'world_religions': ['philosophy'],
+}
+
+categories = {
+    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
+    'Humanities': ['history', 'philosophy', 'law'],
+    'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
+    'Other': ['other', 'business', 'health'],
+}
+
+
+def main():
+
+    reversed_categories = {}
+    for category, subcategory_list in categories.items():
+        for subcategory in subcategory_list:
+            reversed_categories[subcategory] = category
+
+    subject_mapping = {}
+    for subject, subcategory_list in subcategories.items():
+        category_name: str = reversed_categories[subcategory_list[0]]
+        subject_show_name: str = ' '.join([item.capitalize() for item in subject.split('_')])
+        subject_mapping[subject] = [subject_show_name, subcategory_list[0], category_name]
+
+    print(subject_mapping)
+
+
+if __name__ == '__main__':
+    main()
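For reference, the per-subject transformation inside main() can be reproduced for a single entry as in the sketch below (not part of the package); the expected output follows directly from the two dictionaries above:

from evalscope.tools.gen_mmlu_subject_mapping import categories, subcategories

# Invert categories (subcategory -> top-level category), as main() does.
reversed_categories = {sub: cat for cat, subs in categories.items() for sub in subs}

subject = 'abstract_algebra'
show_name = ' '.join(item.capitalize() for item in subject.split('_'))
print([show_name, subcategories[subject][0], reversed_categories[subcategories[subject][0]]])
# -> ['Abstract Algebra', 'math', 'STEM']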
evalscope/tools/rewrite_eval_results.py
@@ -0,0 +1,95 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import time
+
+from evalscope.models.custom import CustomModel
+from evalscope.run import run_task
+from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+from evalscope.utils import yaml_to_dict
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+"""
+This script is used to rewrite the evaluation results without re-running the model predictions.
+"""
+
+
+class DummyCustomModel(CustomModel):
+
+    def __init__(self, config: dict, **kwargs):
+        super(DummyCustomModel, self).__init__(config=config, **kwargs)
+
+    def predict(self, prompts: str, **kwargs):
+        # ONLY FOR DUMMY IMPLEMENTATION, DO NOT EDIT OR USE IN PRODUCTION.
+
+        response = 'The answer is C. NOTE: ONLY FOR TEST'
+
+        res_d: dict = {
+            'choices': [
+                {
+                    'index': 0,
+                    'message': {
+                        # 'content': f'The answer is B. Raw prompt: {prompt}',
+                        'content': response,
+                        'role': 'assistant'
+                    }
+                }
+            ],
+            'created': time.time(),
+            'model': self.config.get('model_id'),  # should be model_id
+            'object': 'chat.completion',
+            'usage': {
+                'completion_tokens': 0,
+                'prompt_tokens': 0,
+                'total_tokens': 0
+            }
+        }
+
+        return [res_d for _ in prompts]
+
+
+def get_task_cfg(cfg_file: str, model_instance: CustomModel):
+    if cfg_file:
+        cfg_file: str = os.path.abspath(cfg_file)
+        logger.info(f'Loading task config from {cfg_file}')
+        task_cfg_d: dict = yaml_to_dict(yaml_file=cfg_file)
+        task_cfg_d.update({'model': model_instance})
+        logger.info(f'**Task config: {task_cfg_d}')
+    else:
+        # Default config example
+        task_cfg_d = {
+            'model_args': {},
+            'generation_config': {},
+            'dataset_args': {},
+            'dry_run': False,
+            'model': model_instance,  # NOTE: model_id or model_dir or model_instance(CustomModel)
+            'eval_type': 'custom',  # NOTE: `checkpoint` or `custom` or `service`
+            'datasets': ['arc'],
+            'work_dir': DEFAULT_ROOT_CACHE_DIR,
+            'outputs': './outputs/eval_swift_dummy',
+            'mem_cache': False,
+            'dataset_hub': 'ModelScope',
+            'dataset_dir': DEFAULT_ROOT_CACHE_DIR,
+            'stage': 'all',
+            'limit': 10,
+            'debug': False
+        }
+
+    return task_cfg_d
+
+
+if __name__ == '__main__':
+    # step 1: if the outputs directory has been moved, update the path settings in outputs/eval_xxx/configs/task_output_config.yaml
+    # step 2: run this script; use_cache=True is used by default, so the eval results are refreshed without re-running inference
+
+    swift_model = DummyCustomModel(config={'model_id': 'swift-model-dummy'})
+
+    task_cfg_file = '/path/to/eval_your_model_results/configs/task_output_config.yaml'
+
+    task_cfg_d = yaml_to_dict(task_cfg_file)
+    task_cfg_d.update({'model': swift_model})
+
+    eval_results: dict = run_task(task_cfg=task_cfg_d)
+    print(f'** Evaluation results finished !\n')
+
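A short sketch (not in the wheel) of the OpenAI-style payload that DummyCustomModel.predict returns per prompt; the prompt text is made up and the import path assumes the module above:

from evalscope.tools.rewrite_eval_results import DummyCustomModel

model = DummyCustomModel(config={'model_id': 'swift-model-dummy'})
responses = model.predict(prompts=['What is 2 + 2?'])

# One chat.completion-shaped dict per prompt, carrying the fixed dummy answer.
print(responses[0]['choices'][0]['message']['content'])  # -> The answer is C. NOTE: ONLY FOR TEST
print(responses[0]['model'])                             # -> swift-model-dummy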
evalscope/utils/arena_utils.py
@@ -0,0 +1,247 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) lmsys.org.
+
+import random
+from collections import OrderedDict, defaultdict
+from typing import List, Sequence, Union
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def compute_elo(battles,
+                col_model_a='model_a',
+                col_model_b='model_b',
+                col_win='win',
+                tie_values=['tie', 'tie (bothbad)'],
+                k=32,
+                scale=400,
+                base=10,
+                init_rating=1000):
+    rating = defaultdict(lambda: init_rating)
+
+    for rd, model_a, model_b, win in battles[[
+            col_model_a, col_model_b, col_win
+    ]].itertuples():
+        ra = rating[model_a]
+        rb = rating[model_b]
+        ea = 1 / (1 + base**((rb - ra) / scale))
+        eb = 1 / (1 + base**((ra - rb) / scale))
+        if win == col_model_a:
+            sa = 1
+        elif win == col_model_b:
+            sa = 0
+        elif win in tie_values:
+            sa = 0.5
+        else:
+            raise Exception(f'unexpected vote {win}')
+        rating[model_a] += k * (sa - ea)
+        rating[model_b] += k * (1 - sa - eb)
+
+    return rating
+
+
+def merge_ques_ans(answer_list_all,
+                   merge_key: str = 'question_id',
+                   merge_mode: str = 'inner') -> pd.DataFrame:
+    """
+    Merge question and answer list to unified data.
+
+    Args:
+        answer_list_all: list of answer list,
+            e.g. [ans1_list, ans2_list, ...], an ans_list is predicted answers
+            of a specific model, must contain following columns: 'question_id',
+            'text', 'category', 'model_id', 'answer'
+        merge_key: key for dataframe merging
+        merge_mode: mode for dataframe merging,
+            e.g. 'inner', 'left', 'right', 'outer'
+
+    Returns:
+        pandas DataFrame: merged dataframe, e.g. columns are
+            ['question_id', 'gpt-3.5-turbo', 'llama2-7b']
+    """
+    ans_df = pd.DataFrame()
+    for ans_list in answer_list_all:
+        ans_list = [{
+            'question_id': item['question_id'],
+            item['model_id']: item
+        } for item in ans_list]
+        if ans_df.empty:
+            ans_df = pa.Table.from_pylist(ans_list).to_pandas()
+        else:
+            ans_df = pd.merge(
+                ans_df,
+                pa.Table.from_pylist(ans_list).to_pandas(),
+                on=merge_key,
+                how=merge_mode)
+
+    return ans_df
+
+
+def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:
+    """
+    Get battle pair names from columns.
+
+    Args:
+        columns: list of column names.
+
+    Returns:
+        list of battle pairs.
+
+    Example:
+        >>> columns = ['A', 'B', 'C']
+        >>> res = get_battle_pairs(columns)
+        >>> print(res)
+        >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
+
+        >>> columns = ['A', 'B', 'C']
+        >>> res = get_battle_pairs(columns, 2)
+        >>> print(res)
+        >>> [('A', 'C'), ('B', 'C')]
+    """
+    res_list = []
+
+    cols_num = len(columns)
+    if cols_num <= 0:
+        return res_list
+
+    if baseline_idx != -1:
+        n_column = columns[baseline_idx]
+        res_list = [(column, n_column) for column in columns
+                    if column != n_column]
+    else:
+        mat = np.ones((cols_num, cols_num))
+        mat_lower_tril = np.tril(mat, k=-1)
+        x_ids, y_ids = np.where(mat_lower_tril == 1)
+        res_list = [(columns[x_id], columns[y_id])
+                    for x_id, y_id in zip(x_ids, y_ids)]
+
+    return res_list
+
+
+def get_battle_pairs_origin(columns: List[str],
+                            compare_base: bool = False,
+                            swap: bool = False):  # TODO: to refactor
+    """
+    Get battle pair names from columns.
+
+    Args:
+        columns: list of column names.
+
+    Returns:
+        list of battle pairs.
+
+    Example:
+        >>> columns = ['A', 'B', 'C']
+        >>> res = get_battle_pairs(columns)
+        >>> print(res)
+        >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
+    """
+    res_list = []
+
+    cols_num = len(columns)
+    if cols_num <= 0:
+        return res_list
+
+    if not compare_base:
+        mat = np.ones((cols_num, cols_num))
+        mat_lower_tril = np.tril(mat, k=-1)
+        x_ids, y_ids = np.where(mat_lower_tril == 1)
+        res_list = [(columns[x_id], columns[y_id])
+                    for x_id, y_id in zip(x_ids, y_ids)]
+    else:
+        for column in columns[1:]:
+            res_list.append((columns[0], column))
+
+    if swap:
+        res_list.extend([(j, i) for i, j in res_list])
+    return res_list
+
+
+def shuffle_pairwise_preferences(
+        df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
+    """Shuffle the outputs of a pairwise preference dataframe.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame([dict(instruction='2+2', output_1='3', output_2='4', preference=2),
+                           dict(instruction='2+3', output_1='5', output_2='4', preference=1)])
+    >>> print(shuffle_pairwise_preferences(df, [True, False]))
+      instruction output_1 output_2  preference
+    0         2+2        4        3           1
+    1         2+3        5        4           1
+    """
+    col_1 = df['output_1'].copy()
+    col_2 = df['output_2'].copy()
+    df['output_1'] = np.where(arr_is_shuffle, col_2, col_1)
+    df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)
+
+    if 'preference' in df.columns:
+        df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'],
+                                    df['preference'])
+
+    return df
+
+
+class BattlePairSelection:
+    """
+    Select battle pairs by specific strategy.
+
+    Attributes:
+        model_elo_map(dict): map of model_id--base_elo_score
+    """
+
+    DEFAULT_K = 5
+
+    def __init__(self, model_elo_map: Union[dict, OrderedDict]):
+        # Make sure model_elo_map to be ordered when compare_base is true.
+        self.model_elo_map = model_elo_map
+
+    def top_k(self,
+              k: int = DEFAULT_K,
+              compare_base: bool = False,
+              swap: bool = False) -> list:
+        if k <= 0:
+            k = self.DEFAULT_K
+        sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
+        sorted_res = list(dict(sorted_res).keys())
+        return get_battle_pairs_origin(sorted_res, compare_base, swap)
+
+    def random_k(self,
+                 k: int = DEFAULT_K,
+                 compare_base: bool = False,
+                 swap: bool = False) -> list:
+        if k <= 0:
+            k = self.DEFAULT_K
+        if k > len(self.model_elo_map):
+            k = len(self.model_elo_map)
+        candidate_list = list(self.model_elo_map.items())
+        k = len(candidate_list) if k > len(candidate_list) else k
+        res = dict(random.sample(candidate_list, k=k))
+        res = list(res.keys())
+        return get_battle_pairs_origin(res, compare_base, swap)
+
+    def volatility_index(self,
+                         frac: float = 0.2,
+                         compare_base: bool = False,
+                         swap: bool = False) -> list:
+        res_list = []
+        candidate_list = get_battle_pairs_origin(
+            list(self.model_elo_map.keys()), compare_base, swap)
+        for t in candidate_list:
+            model_a = t[0]
+            model_b = t[1]
+            base_elo_a = self.model_elo_map.get(model_a)
+            base_elo_b = self.model_elo_map.get(model_b)
+
+            vol_frac = abs(base_elo_b - base_elo_a) / max(
+                base_elo_a, base_elo_b)
+            if vol_frac <= frac:
+                res_list.append(t)
+
+        return res_list
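For orientation, a minimal sketch (not shipped in the wheel) of feeding compute_elo() a pandas DataFrame of battle records; the model names and outcomes are invented, and the 'win' column holds 'model_a', 'model_b', or a tie value, matching the checks above:

import pandas as pd

from evalscope.utils.arena_utils import compute_elo

battles = pd.DataFrame([
    {'model_a': 'qwen-7b-chat', 'model_b': 'chatglm2-6b', 'win': 'model_a'},
    {'model_a': 'qwen-7b-chat', 'model_b': 'chatglm2-6b', 'win': 'tie'},
    {'model_a': 'chatglm2-6b', 'model_b': 'qwen-7b-chat', 'win': 'model_b'},
])

# Ratings start at init_rating=1000 and move by at most k=32 per battle.
ratings = compute_elo(battles)
for model, score in sorted(ratings.items(), key=lambda x: -x[1]):
    print(f'{model}: {score:.1f}')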
evalscope/utils/completion_parsers.py
@@ -0,0 +1,87 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa
+
+import ast
+import re
+
+
+# from . import utils as ann_utils
+from evalscope.constants import ArenaWinner
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
+one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
+
+
+# modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+# does not work with batched completions
+def lmsys_parser(completion, output_format):
+    if output_format == '[[rating]]':
+        match = re.search(one_score_pattern, completion)
+        if not match:
+            match = re.search(one_score_pattern_backup, completion)
+
+        if match:
+            rating = ast.literal_eval(match.groups()[0])
+        else:
+            logger.error(f'Content: {completion}\n'
+                         'You must manually fix the score.')
+            rating = -1
+
+        return rating
+    if output_format == '[[rating_a,rating_b]]':
+        try:
+            score_pair = completion.split('\n')[0]
+            score_pair = score_pair.replace(',', ' ')
+            sp = score_pair.split(' ')
+            if len(sp) == 2:
+                score_1 = float(sp[0])
+                score_2 = float(sp[1])
+                if score_1 > score_2:
+                    winner = ArenaWinner.MODEL_A
+                elif score_1 < score_2:
+                    winner = ArenaWinner.MODEL_B
+                else:
+                    if score_1 == score_1 == -1:
+                        winner = ArenaWinner.UNKNOWN
+                    winner = ArenaWinner.TIE
+                return winner, [score_1, score_2]
+            else:
+                raise Exception('Invalid score pair.')
+        except Exception as e:
+            logger.error(
+                f'{e}\nContent: {completion}\nYou must manually fix the score pair.'
+            )
+            return ArenaWinner.UNKNOWN, [-1, -1]
+    elif output_format == '[[A]]':
+        if '[[A]]' in completion:
+            winner = ArenaWinner.MODEL_A
+        elif '[[B]]' in completion:
+            winner = ArenaWinner.MODEL_B
+        elif '[[C]]' in completion:
+            winner = ArenaWinner.TIE
+        else:
+            logger.error(
+                f'\nContent: {completion}\nYou must manually fix the score.')
+            winner = ArenaWinner.UNKNOWN
+        return winner
+
+
+def ranking_parser(completion, **kwargs):
+    try:
+        if isinstance(completion, str):
+            ordered_completions = ast.literal_eval(completion)
+        else:
+            ordered_completions = completion
+
+        rank = [c for c in ordered_completions
+                if c['model'] == 'model_a'][0]['rank']
+        assert rank in [1, 2]
+
+        return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+    except Exception as e:
+        logger.error(f'{e}\nContent: {completion}\n'
+                     'You must manually fix the score pair.')
+        return ArenaWinner.UNKNOWN
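Finally, a brief sketch (not part of the package) of how the parsers above read reviewer completions; the completion strings are invented:

from evalscope.utils.completion_parsers import lmsys_parser, ranking_parser

# Single-score format: the reviewer wraps its rating in [[...]].
print(lmsys_parser('The response is helpful. Rating: [[8]]', '[[rating]]'))  # -> 8

# Pairwise verdict format: [[A]], [[B]] or [[C]] for a tie.
print(lmsys_parser('Both are close, but [[A]] is better.', '[[A]]'))  # -> model A as winner

# Ranking format: a list of {'model': ..., 'rank': ...} entries.
print(ranking_parser([{'model': 'model_a', 'rank': 1}, {'model': 'model_b', 'rank': 2}]))  # -> model A (ranked 1st)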