evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/tools/combine_reports.py
@@ -0,0 +1,140 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import os
+ import json
+ import glob
+ from tabulate import tabulate
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ """
+ Combine and generate table for reports of LLMs.
+ """
+
+
+ def get_report(report_file: str):
+     data_d: dict = json.load(open(report_file, 'r'))
+     dataset_name = data_d['name']
+     score = data_d['score']  # float or dict
+     score_d = {}
+     if isinstance(score, dict):
+         # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
+         score_d = score
+     elif isinstance(score, float):
+         # score_d['acc'] = round(score, 4) * 100
+         score_d['acc'] = score
+     else:
+         raise ValueError(f'Unknown score type: {type(score)}')
+     # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
+     score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
+
+     return {'dataset_name': dataset_name, 'score': score_str}
+
+
+ def get_model_reports(model_report_dir: str):
+     model_report_dir = os.path.normpath(model_report_dir)
+     model_report_dir = model_report_dir.rstrip('reports')
+     model_info = os.path.basename(os.path.normpath(model_report_dir))
+     model_name = '_'.join(model_info.split('_')[:-1][3:])
+     report_files = glob.glob(os.path.join(model_report_dir, 'reports', '*.json'))
+
+     model_reports_d = {model_name: []}
+     for file_path in report_files:
+         report_d = get_report(file_path)
+         model_reports_d[model_name].append(report_d)
+
+     return model_reports_d
+
+
+ def gen_table(reports_path_list: list):
+     table_values = []
+     headers = ['Model']
+     is_headers_set = False
+
+     for report_path in reports_path_list:
+         model_reports_d = get_model_reports(report_path)
+         for model_name, report_list in model_reports_d.items():
+             # report_list: [{'dataset_name': 'CompetitionMath', 'score': '4.42 (acc)'},
+             #               {'dataset_name': 'GSM8K', 'score': '28.51 (acc)'}]
+             report_list = sorted(report_list, key=lambda x: x['dataset_name'])
+             if not is_headers_set:
+                 headers.extend([x['dataset_name'] for x in report_list])
+                 is_headers_set = True
+             single_row = []
+             single_row.append(model_name)
+             for single_report in report_list:
+                 # e.g. '28.51 (acc)'
+                 single_row.append(single_report['score'])
+             table_values.append(single_row)
+
+     report_table = tabulate(table_values, headers=headers, tablefmt='grid')
+     return report_table
+
+ class ReportsRecorder:
+     COMMON_DATASET_PATH = []
+     CUSTOM_DATASET_PATH = []
+
+     def __init__(self, oss_url: str = "", endpoint: str = ""):
+         if oss_url and endpoint:
+             import oss2
+             from oss2.credentials import EnvironmentVariableCredentialsProvider
+
+             auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
+             oss_url = oss_url.replace("oss://", "").split('/')
+             bucket_name = oss_url[0]
+
+             self.object_path = "/".join(oss_url[1:])
+             self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
+         else:
+             self.object_path = ""
+             self.bucket = None
+
+
+     def append_path(self, report_path: str, dataset_name: str):
+         if dataset_name == "general_qa":
+             self.CUSTOM_DATASET_PATH.append(report_path)
+         else:
+             self.COMMON_DATASET_PATH.append(report_path)
+
+     def dump_reports(self, output_dir: str):
+         result = {
+             "CommonDataset": [],
+             "CustomDataset": []
+         }
+         for line in self.COMMON_DATASET_PATH:
+             with open(line, 'r') as f:
+                 report = json.load(f)
+                 result['CommonDataset'].append(report)
+         for line in self.CUSTOM_DATASET_PATH:
+             with open(line, 'r') as f:
+                 report = json.load(f)
+                 report.update({"name": os.path.basename(line)})
+                 result['CustomDataset'].append(report)
+
+         os.makedirs(output_dir, exist_ok=True)
+         output_file_name = "metric.json"
+         output_path = os.path.join(output_dir, output_file_name)
+         with open(output_path, 'w+') as f:
+             f.write(json.dumps(result, ensure_ascii=False, indent=4))
+
+         if self.bucket:
+             remote_path = os.path.join(self.object_path, output_file_name)
+             logger.info(f"** Upload report to oss: {remote_path}")
+             self.bucket.put_object_from_file(remote_path, output_path)
+
+ if __name__ == '__main__':
+     report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
+     report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
+
+     report_table = gen_table([report_dir_1, report_dir_2])
+     print(report_table)
+
+     # ALL VALUES ONLY FOR EXAMPLE
+     # +--------------------------+-------------------+-------------+
+     # | Model                    | CompetitionMath   | GSM8K       |
+     # +==========================+===================+=============+
+     # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
+     # +--------------------------+-------------------+-------------+
+     # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
+     # +--------------------------+-------------------+-------------+
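
A minimal usage sketch for the module above: gen_table merges the per-dataset report JSONs of one or more runs into a grid table, and ReportsRecorder collects report paths and dumps a combined metric.json. The directory and file paths are hypothetical placeholders, not paths shipped with the package.

    from evalscope.tools.combine_reports import ReportsRecorder, gen_table

    # Render one row per model from two hypothetical run report directories.
    print(gen_table(['/path/to/run_a/reports', '/path/to/run_b/reports']))

    # Collect individual report files and dump a merged metric.json locally
    # (no OSS upload when oss_url/endpoint are left empty).
    recorder = ReportsRecorder()
    recorder.append_path('/path/to/run_a/reports/gsm8k.json', dataset_name='gsm8k')
    recorder.dump_reports(output_dir='./outputs/combined')
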
evalscope/tools/gen_mmlu_subject_mapping.py
@@ -0,0 +1,90 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ # Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
+
+ subcategories = {
+     'abstract_algebra': ['math'],
+     'anatomy': ['health'],
+     'astronomy': ['physics'],
+     'business_ethics': ['business'],
+     'clinical_knowledge': ['health'],
+     'college_biology': ['biology'],
+     'college_chemistry': ['chemistry'],
+     'college_computer_science': ['computer science'],
+     'college_mathematics': ['math'],
+     'college_medicine': ['health'],
+     'college_physics': ['physics'],
+     'computer_security': ['computer science'],
+     'conceptual_physics': ['physics'],
+     'econometrics': ['economics'],
+     'electrical_engineering': ['engineering'],
+     'elementary_mathematics': ['math'],
+     'formal_logic': ['philosophy'],
+     'global_facts': ['other'],
+     'high_school_biology': ['biology'],
+     'high_school_chemistry': ['chemistry'],
+     'high_school_computer_science': ['computer science'],
+     'high_school_european_history': ['history'],
+     'high_school_geography': ['geography'],
+     'high_school_government_and_politics': ['politics'],
+     'high_school_macroeconomics': ['economics'],
+     'high_school_mathematics': ['math'],
+     'high_school_microeconomics': ['economics'],
+     'high_school_physics': ['physics'],
+     'high_school_psychology': ['psychology'],
+     'high_school_statistics': ['math'],
+     'high_school_us_history': ['history'],
+     'high_school_world_history': ['history'],
+     'human_aging': ['health'],
+     'human_sexuality': ['culture'],
+     'international_law': ['law'],
+     'jurisprudence': ['law'],
+     'logical_fallacies': ['philosophy'],
+     'machine_learning': ['computer science'],
+     'management': ['business'],
+     'marketing': ['business'],
+     'medical_genetics': ['health'],
+     'miscellaneous': ['other'],
+     'moral_disputes': ['philosophy'],
+     'moral_scenarios': ['philosophy'],
+     'nutrition': ['health'],
+     'philosophy': ['philosophy'],
+     'prehistory': ['history'],
+     'professional_accounting': ['other'],
+     'professional_law': ['law'],
+     'professional_medicine': ['health'],
+     'professional_psychology': ['psychology'],
+     'public_relations': ['politics'],
+     'security_studies': ['politics'],
+     'sociology': ['culture'],
+     'us_foreign_policy': ['politics'],
+     'virology': ['health'],
+     'world_religions': ['philosophy'],
+ }
+
+ categories = {
+     'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
+     'Humanities': ['history', 'philosophy', 'law'],
+     'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
+     'Other': ['other', 'business', 'health'],
+ }
+
+
+ def main():
+
+     reversed_categories = {}
+     for category, subcategory_list in categories.items():
+         for subcategory in subcategory_list:
+             reversed_categories[subcategory] = category
+
+     subject_mapping = {}
+     for subject, subcategory_list in subcategories.items():
+         category_name: str = reversed_categories[subcategory_list[0]]
+         subject_show_name: str = ' '.join([item.capitalize() for item in subject.split('_')])
+         subject_mapping[subject] = [subject_show_name, subcategory_list[0], category_name]
+
+     print(subject_mapping)
+
+
+ if __name__ == '__main__':
+     main()
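
For reference, main() prints a dict mapping each MMLU subject key to [display name, subcategory, category]. One illustrative entry, derived from the tables above:

    # One entry of the printed subject_mapping dict:
    # 'abstract_algebra' -> subcategory 'math' -> category 'STEM'
    {'abstract_algebra': ['Abstract Algebra', 'math', 'STEM']}
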
evalscope/tools/rewrite_eval_results.py
@@ -0,0 +1,95 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ import time
+
+ from evalscope.models.custom import CustomModel
+ from evalscope.run import run_task
+ from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ from evalscope.utils import yaml_to_dict
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ """
+ This script is used to rewrite the evaluation results without re-running the model predictions.
+ """
+
+
+ class DummyCustomModel(CustomModel):
+
+     def __init__(self, config: dict, **kwargs):
+         super(DummyCustomModel, self).__init__(config=config, **kwargs)
+
+     def predict(self, prompts: list, **kwargs):
+         # ONLY FOR DUMMY IMPLEMENTATION, DO NOT EDIT OR USE IN PRODUCTION.
+
+         response = 'The answer is C. NOTE: ONLY FOR TEST'
+
+         res_d: dict = {
+             'choices': [
+                 {
+                     'index': 0,
+                     'message': {
+                         # 'content': f'The answer is B. Raw prompt: {prompt}',
+                         'content': response,
+                         'role': 'assistant'
+                     }
+                 }
+             ],
+             'created': time.time(),
+             'model': self.config.get('model_id'),  # should be model_id
+             'object': 'chat.completion',
+             'usage': {
+                 'completion_tokens': 0,
+                 'prompt_tokens': 0,
+                 'total_tokens': 0
+             }
+         }
+
+         return [res_d for _ in prompts]
+
+
+ def get_task_cfg(cfg_file: str, model_instance: CustomModel):
+     if cfg_file:
+         cfg_file: str = os.path.abspath(cfg_file)
+         logger.info(f'Loading task config from {cfg_file}')
+         task_cfg_d: dict = yaml_to_dict(yaml_file=cfg_file)
+         task_cfg_d.update({'model': model_instance})
+         logger.info(f'**Task config: {task_cfg_d}')
+     else:
+         # Default config example
+         task_cfg_d = {
+             'model_args': {},
+             'generation_config': {},
+             'dataset_args': {},
+             'dry_run': False,
+             'model': model_instance,  # NOTE: model_id, model_dir or model_instance (CustomModel)
+             'eval_type': 'custom',  # NOTE: `checkpoint` or `custom` or `service`
+             'datasets': ['arc'],
+             'work_dir': DEFAULT_ROOT_CACHE_DIR,
+             'outputs': './outputs/eval_swift_dummy',
+             'mem_cache': False,
+             'dataset_hub': 'ModelScope',
+             'dataset_dir': DEFAULT_ROOT_CACHE_DIR,
+             'stage': 'all',
+             'limit': 10,
+             'debug': False
+         }
+
+     return task_cfg_d
+
+
+ if __name__ == '__main__':
+     # step 1: if the outputs directory has been moved, update the paths configured in configs/task_output_config.yaml under outputs/eval_xxx
+     # step 2: run this script; use_cache=True is used by default, so the eval results are refreshed without re-running inference
+
+     swift_model = DummyCustomModel(config={'model_id': 'swift-model-dummy'})
+
+     task_cfg_file = '/path/to/eval_your_model_results/configs/task_output_config.yaml'
+
+     task_cfg_d = yaml_to_dict(task_cfg_file)
+     task_cfg_d.update({'model': swift_model})
+
+     eval_results: dict = run_task(task_cfg=task_cfg_d)
+     print('** Evaluation results finished!\n')
+
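
The DummyCustomModel above returns one OpenAI-style chat.completion dict per prompt; a minimal sketch of calling it directly (the prompt text is arbitrary and only exercises the dummy response path):

    from evalscope.tools.rewrite_eval_results import DummyCustomModel

    model = DummyCustomModel(config={'model_id': 'swift-model-dummy'})
    responses = model.predict(['What is the capital of France?'])
    # Prints the fixed dummy reply: 'The answer is C. NOTE: ONLY FOR TEST'
    print(responses[0]['choices'][0]['message']['content'])
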
evalscope/utils/__init__.py
@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.utils.utils import *
+ from evalscope.utils.task_utils import *
evalscope/utils/arena_utils.py
@@ -0,0 +1,247 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright (c) lmsys.org.
+
+ import random
+ from collections import OrderedDict, defaultdict
+ from typing import List, Sequence, Union
+
+ import numpy as np
+ import pandas as pd
+ import pyarrow as pa
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def compute_elo(battles,
+                 col_model_a='model_a',
+                 col_model_b='model_b',
+                 col_win='win',
+                 tie_values=['tie', 'tie (bothbad)'],
+                 k=32,
+                 scale=400,
+                 base=10,
+                 init_rating=1000):
+     rating = defaultdict(lambda: init_rating)
+
+     for rd, model_a, model_b, win in battles[[
+             col_model_a, col_model_b, col_win
+     ]].itertuples():
+         ra = rating[model_a]
+         rb = rating[model_b]
+         ea = 1 / (1 + base**((rb - ra) / scale))
+         eb = 1 / (1 + base**((ra - rb) / scale))
+         if win == col_model_a:
+             sa = 1
+         elif win == col_model_b:
+             sa = 0
+         elif win in tie_values:
+             sa = 0.5
+         else:
+             raise Exception(f'unexpected vote {win}')
+         rating[model_a] += k * (sa - ea)
+         rating[model_b] += k * (1 - sa - eb)
+
+     return rating
+
+
+ def merge_ques_ans(answer_list_all,
+                    merge_key: str = 'question_id',
+                    merge_mode: str = 'inner') -> pd.DataFrame:
+     """
+     Merge question and answer lists into unified data.
+
+     Args:
+         answer_list_all: list of answer lists,
+             e.g. [ans1_list, ans2_list, ...], an ans_list is predicted answers
+             of a specific model, must contain following columns: 'question_id',
+             'text', 'category', 'model_id', 'answer'
+         merge_key: key for dataframe merging
+         merge_mode: mode for dataframe merging,
+             e.g. 'inner', 'left', 'right', 'outer'
+
+     Returns:
+         pandas DataFrame: merged dataframe, e.g. columns are
+             ['question_id', 'gpt-3.5-turbo', 'llama2-7b']
+     """
+     ans_df = pd.DataFrame()
+     for ans_list in answer_list_all:
+         ans_list = [{
+             'question_id': item['question_id'],
+             item['model_id']: item
+         } for item in ans_list]
+         if ans_df.empty:
+             ans_df = pa.Table.from_pylist(ans_list).to_pandas()
+         else:
+             ans_df = pd.merge(
+                 ans_df,
+                 pa.Table.from_pylist(ans_list).to_pandas(),
+                 on=merge_key,
+                 how=merge_mode)
+
+     return ans_df
+
+
+ def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:
+     """
+     Get battle pair names from columns.
+
+     Args:
+         columns: list of column names.
+
+     Returns:
+         list of battle pairs.
+
+     Example:
+         >>> columns = ['A', 'B', 'C']
+         >>> res = get_battle_pairs(columns)
+         >>> print(res)
+         >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
+
+         >>> columns = ['A', 'B', 'C']
+         >>> res = get_battle_pairs(columns, 2)
+         >>> print(res)
+         >>> [('A', 'C'), ('B', 'C')]
+     """
+     res_list = []
+
+     cols_num = len(columns)
+     if cols_num <= 0:
+         return res_list
+
+     if baseline_idx != -1:
+         n_column = columns[baseline_idx]
+         res_list = [(column, n_column) for column in columns
+                     if column != n_column]
+     else:
+         mat = np.ones((cols_num, cols_num))
+         mat_lower_tril = np.tril(mat, k=-1)
+         x_ids, y_ids = np.where(mat_lower_tril == 1)
+         res_list = [(columns[x_id], columns[y_id])
+                     for x_id, y_id in zip(x_ids, y_ids)]
+
+     return res_list
+
+
+ def get_battle_pairs_origin(columns: List[str],
+                             compare_base: bool = False,
+                             swap: bool = False):  # TODO: to refactor
+     """
+     Get battle pair names from columns.
+
+     Args:
+         columns: list of column names.
+
+     Returns:
+         list of battle pairs.
+
+     Example:
+         >>> columns = ['A', 'B', 'C']
+         >>> res = get_battle_pairs(columns)
+         >>> print(res)
+         >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
+     """
+     res_list = []
+
+     cols_num = len(columns)
+     if cols_num <= 0:
+         return res_list
+
+     if not compare_base:
+         mat = np.ones((cols_num, cols_num))
+         mat_lower_tril = np.tril(mat, k=-1)
+         x_ids, y_ids = np.where(mat_lower_tril == 1)
+         res_list = [(columns[x_id], columns[y_id])
+                     for x_id, y_id in zip(x_ids, y_ids)]
+     else:
+         for column in columns[1:]:
+             res_list.append((columns[0], column))
+
+     if swap:
+         res_list.extend([(j, i) for i, j in res_list])
+     return res_list
+
+
+ def shuffle_pairwise_preferences(
+         df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
+     """Shuffle the outputs of a pairwise preference dataframe.
+
+     Examples
+     --------
+     >>> df = pd.DataFrame([dict(instruction='2+2', output_1='3', output_2='4', preference=2),
+                            dict(instruction='2+3', output_1='5', output_2='4', preference=1)])
+     >>> print(shuffle_pairwise_preferences(df, [True, False]))
+         instruction output_1 output_2  preference
+     0           2+2        4        3           1
+     1           2+3        5        4           1
+     """
+     col_1 = df['output_1'].copy()
+     col_2 = df['output_2'].copy()
+     df['output_1'] = np.where(arr_is_shuffle, col_2, col_1)
+     df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)
+
+     if 'preference' in df.columns:
+         df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'],
+                                     df['preference'])
+
+     return df
+
+
+ class BattlePairSelection:
+     """
+     Select battle pairs by specific strategy.
+
+     Attributes:
+         model_elo_map(dict): map of model_id--base_elo_score
+     """
+
+     DEFAULT_K = 5
+
+     def __init__(self, model_elo_map: Union[dict, OrderedDict]):
+         # Make sure model_elo_map is ordered when compare_base is True.
+         self.model_elo_map = model_elo_map
+
+     def top_k(self,
+               k: int = DEFAULT_K,
+               compare_base: bool = False,
+               swap: bool = False) -> list:
+         if k <= 0:
+             k = self.DEFAULT_K
+         sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
+         sorted_res = list(dict(sorted_res).keys())
+         return get_battle_pairs_origin(sorted_res, compare_base, swap)
+
+     def random_k(self,
+                  k: int = DEFAULT_K,
+                  compare_base: bool = False,
+                  swap: bool = False) -> list:
+         if k <= 0:
+             k = self.DEFAULT_K
+         if k > len(self.model_elo_map):
+             k = len(self.model_elo_map)
+         candidate_list = list(self.model_elo_map.items())
+         k = len(candidate_list) if k > len(candidate_list) else k
+         res = dict(random.sample(candidate_list, k=k))
+         res = list(res.keys())
+         return get_battle_pairs_origin(res, compare_base, swap)
+
+     def volatility_index(self,
+                          frac: float = 0.2,
+                          compare_base: bool = False,
+                          swap: bool = False) -> list:
+         res_list = []
+         candidate_list = get_battle_pairs_origin(
+             list(self.model_elo_map.keys()), compare_base, swap)
+         for t in candidate_list:
+             model_a = t[0]
+             model_b = t[1]
+             base_elo_a = self.model_elo_map.get(model_a)
+             base_elo_b = self.model_elo_map.get(model_b)
+
+             vol_frac = abs(base_elo_b - base_elo_a) / max(
+                 base_elo_a, base_elo_b)
+             if vol_frac <= frac:
+                 res_list.append(t)
+
+         return res_list
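
compute_elo above expects a pandas DataFrame of battles whose win column holds either the name of the winning side's column ('model_a' / 'model_b') or one of the tie values. A minimal sketch with made-up battle records:

    import pandas as pd

    from evalscope.utils.arena_utils import compute_elo

    # Three illustrative battles between two models.
    battles = pd.DataFrame([
        {'model_a': 'gpt-3.5-turbo', 'model_b': 'llama2-7b', 'win': 'model_a'},
        {'model_a': 'gpt-3.5-turbo', 'model_b': 'llama2-7b', 'win': 'tie'},
        {'model_a': 'llama2-7b', 'model_b': 'gpt-3.5-turbo', 'win': 'model_a'},
    ])
    ratings = compute_elo(battles)  # defaultdict: model_id -> Elo score, starting at 1000
    print(sorted(ratings.items(), key=lambda x: -x[1]))
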
evalscope/utils/completion_parsers.py
@@ -0,0 +1,87 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa
+
+ import ast
+ import re
+
+
+ # from . import utils as ann_utils
+ from evalscope.constants import ArenaWinner
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ one_score_pattern = re.compile(r'\[\[(\d+\.?\d*)\]\]')
+ one_score_pattern_backup = re.compile(r'\[(\d+\.?\d*)\]')
+
+
+ # modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+ # does not work with batched completions
+ def lmsys_parser(completion, output_format):
+     if output_format == '[[rating]]':
+         match = re.search(one_score_pattern, completion)
+         if not match:
+             match = re.search(one_score_pattern_backup, completion)
+
+         if match:
+             rating = ast.literal_eval(match.groups()[0])
+         else:
+             logger.error(f'Content: {completion}\n'
+                          'You must manually fix the score.')
+             rating = -1
+
+         return rating
+     if output_format == '[[rating_a,rating_b]]':
+         try:
+             score_pair = completion.split('\n')[0]
+             score_pair = score_pair.replace(',', ' ')
+             sp = score_pair.split(' ')
+             if len(sp) == 2:
+                 score_1 = float(sp[0])
+                 score_2 = float(sp[1])
+                 if score_1 > score_2:
+                     winner = ArenaWinner.MODEL_A
+                 elif score_1 < score_2:
+                     winner = ArenaWinner.MODEL_B
+                 elif score_1 == score_2 == -1:
+                     winner = ArenaWinner.UNKNOWN
+                 else:
+                     winner = ArenaWinner.TIE
+                 return winner, [score_1, score_2]
+             else:
+                 raise Exception('Invalid score pair.')
+         except Exception as e:
+             logger.error(
+                 f'{e}\nContent: {completion}\nYou must manually fix the score pair.'
+             )
+             return ArenaWinner.UNKNOWN, [-1, -1]
+     elif output_format == '[[A]]':
+         if '[[A]]' in completion:
+             winner = ArenaWinner.MODEL_A
+         elif '[[B]]' in completion:
+             winner = ArenaWinner.MODEL_B
+         elif '[[C]]' in completion:
+             winner = ArenaWinner.TIE
+         else:
+             logger.error(
+                 f'\nContent: {completion}\nYou must manually fix the score.')
+             winner = ArenaWinner.UNKNOWN
+         return winner
+
+
+ def ranking_parser(completion, **kwargs):
+     try:
+         if isinstance(completion, str):
+             ordered_completions = ast.literal_eval(completion)
+         else:
+             ordered_completions = completion
+
+         rank = [c for c in ordered_completions
+                 if c['model'] == 'model_a'][0]['rank']
+         assert rank in [1, 2]
+
+         return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+     except Exception as e:
+         logger.error(f'{e}\nContent: {completion}\n'
+                      'You must manually fix the score pair.')
+         return ArenaWinner.UNKNOWN
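
A short sketch of how the parsers above behave on typical judge outputs (the completion strings are made-up examples, not fixtures from the package):

    from evalscope.utils.completion_parsers import lmsys_parser, ranking_parser

    # Single-score format: the rating is wrapped in double brackets.
    print(lmsys_parser('Helpful and correct. Rating: [[8.5]]', '[[rating]]'))  # -> 8.5

    # Pairwise verdict format: [[A]], [[B]] or [[C]] for a tie.
    print(lmsys_parser('Assistant A is clearer. [[A]]', '[[A]]'))  # -> ArenaWinner.MODEL_A

    # Ranking format: a list of {'model': ..., 'rank': ...} entries.
    print(ranking_parser([{'model': 'model_a', 'rank': 1},
                          {'model': 'model_b', 'rank': 2}]))  # -> ArenaWinner.MODEL_A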