evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
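Beyond the new benchmarks (ifeval, iquiz, mmlu_pro), the list shows the 0.8.2 single-file model layer (model_adapter.py, openai_model.py, dummy_chat_model.py) being split into per-role adapters under evalscope/models/, and tools/combine_reports.py giving way to the new evalscope/report/ package. For orientation, a minimal sketch of how an evaluation is typically launched through the entry points that appear in this list (evalscope/run.py, evalscope/config.py); the exact argument names are assumptions, not confirmed by this diff:

# Sketch only: entry points taken from the file list above; argument
# names (model, datasets, limit) are assumed, not confirmed by this diff.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],                  # any adapter from the list above
    limit=5,                             # small smoke-test run
)
run_task(task_cfg=task_cfg)

The deleted files below are shown as full-file removal hunks.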
evalscope/models/openai_model.py (deleted)
@@ -1,103 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import openai
-import os
-import time
-
-from evalscope.models import ChatBaseModel
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class OpenAIModel(ChatBaseModel):
-    """
-    APIs of OpenAI models.
-    Available models: gpt-3.5-turbo, gpt-4
-    """
-
-    MAX_RETRIES = 3
-
-    def __init__(self, model_cfg: dict, **kwargs):
-        super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
-
-        openai_api_key = os.environ.get('OPENAI_API_KEY', None)
-        self.api_key = self.model_cfg.get('api_key', openai_api_key)
-
-        if not self.api_key:
-            logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
-            # raise ValueError(
-            #     'OpenAI API key is not provided, '
-            #     'please set it in environment variable OPENAI_API_KEY')
-
-    def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
-
-        sys_prompt: str = inputs.get('sys_prompt', '')
-        user_prompt: str = inputs.get('user_prompt', '')
-
-        # model_id: str = kwargs.get('model_id', '')
-        temperature: float = kwargs.pop('temperature', 0.2)
-        max_tokens: int = kwargs.pop('max_tokens', 1024)
-        mode: str = kwargs.pop('mode', 'chat.completion')
-
-        logger.info(f'Using OpenAI model_id: {model_id}')
-
-        res = self._predict(
-            model_id=model_id,
-            sys_prompt=sys_prompt,
-            user_prompt=user_prompt,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            mode=mode)
-
-        return res
-
-    def _predict(
-        self,
-        model_id,
-        sys_prompt,
-        user_prompt,
-        temperature,
-        max_tokens,
-        mode: str = 'chat.completion',
-    ) -> dict:
-
-        res = {}
-        openai.api_key = self.api_key
-
-        for i in range(self.MAX_RETRIES):
-            try:
-                if mode == 'chat.completion':
-                    resp = openai.ChatCompletion.create(
-                        model=model_id,
-                        messages=[{
-                            'role': 'system',
-                            'content': sys_prompt
-                        }, {
-                            'role': 'user',
-                            'content': user_prompt
-                        }],
-                        temperature=temperature,
-                        max_tokens=max_tokens)
-
-                    if resp:
-                        ans_text = resp['choices'][0]['message']['content']
-                        model_id = resp['model']
-                    else:
-                        logger.warning(f'OpenAI GPT API call failed: got empty response '
-                                       f'for input {sys_prompt} {user_prompt}')
-                        ans_text = ''
-                        model_id = ''
-
-                    res['ans_text'] = ans_text
-                    res['model_id'] = model_id
-                else:
-                    raise ValueError(f'Invalid mode: {mode}')
-
-                return res
-
-            except Exception as e:
-                logger.warning(f'OpenAI API call failed: {e}')
-                time.sleep(3)
-        logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
-        return res
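The deleted OpenAIModel above wraps the legacy pre-1.0 openai.ChatCompletion interface, which was removed in openai>=1.0; judging by the file list, its role passes to the new evalscope/models/server_adapter.py, whose contents are not shown in this diff. For reference, a hedged sketch of the same retrying call against the current v1 client (chat_once is a hypothetical helper, not evalscope API):

# Sketch only: a modern openai>=1.0 equivalent of the deleted call above.
# chat_once is hypothetical; how server_adapter.py does this is not shown.
import time

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def chat_once(model_id: str, sys_prompt: str, user_prompt: str,
              temperature: float = 0.2, max_tokens: int = 1024,
              max_retries: int = 3) -> dict:
    for _ in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=model_id,
                messages=[{'role': 'system', 'content': sys_prompt},
                          {'role': 'user', 'content': user_prompt}],
                temperature=temperature,
                max_tokens=max_tokens)
            return {'ans_text': resp.choices[0].message.content, 'model_id': resp.model}
        except Exception:
            time.sleep(3)  # fixed 3s backoff, mirroring the deleted code
    return {}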
evalscope/tools/__init__.py (deleted)
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/tools/combine_reports.py (deleted)
@@ -1,133 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import glob
-import json
-import os
-from collections import defaultdict
-from tabulate import tabulate
-
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-"""
-Combine and generate table for reports of LLMs.
-"""
-
-
-def get_report(report_file: str):
-    data_d: dict = json.load(open(report_file, 'r'))
-    dataset_name = data_d['dataset_name']
-    model_name = data_d['model_name']
-    score = data_d['score']  # float or dict
-    metric = data_d['metric']
-    score_d = {}
-    if isinstance(score, dict):
-        score_d = score
-    elif isinstance(score, float):
-        score_d[metric] = score
-    else:
-        raise ValueError(f'Unknown score type: {type(score)}')
-    score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
-
-    return model_name, {'dataset_name': dataset_name, 'score': score_str}
-
-
-def get_model_reports(model_report_dir: str):
-    model_report_dir = os.path.normpath(model_report_dir)
-    report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
-
-    model_reports_d = defaultdict(list)
-    for file_path in report_files:
-        model_name, report_d = get_report(file_path)
-        model_reports_d[model_name].append(report_d)
-
-    return model_reports_d
-
-
-def gen_table(reports_path_list: list):
-    table_values = []
-    headers = ['Model']
-    is_headers_set = False
-
-    for report_path in reports_path_list:
-        model_reports_d = get_model_reports(report_path)
-        for model_name, report_list in model_reports_d.items():
-            report_list = sorted(report_list, key=lambda x: x['dataset_name'])
-            if not is_headers_set:
-                headers.extend([x['dataset_name'] for x in report_list])
-                is_headers_set = True
-            single_row = []
-            single_row.append(model_name)
-            for single_report in report_list:
-                # e.g. '28.51 (acc)'
-                single_row.append(single_report['score'])
-            table_values.append(single_row)
-
-    report_table = tabulate(table_values, headers=headers, tablefmt='grid')
-    return report_table
-
-
-class ReportsRecorder:
-    COMMON_DATASET_PATH = []
-    CUSTOM_DATASET_PATH = []
-
-    def __init__(self, oss_url: str = '', endpoint: str = ''):
-        if oss_url and endpoint:
-            import oss2
-            from oss2.credentials import EnvironmentVariableCredentialsProvider
-
-            auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
-            oss_url = oss_url.replace('oss://', '').split('/')
-            bucket_name = oss_url[0]
-
-            self.object_path = '/'.join(oss_url[1:])
-            self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
-        else:
-            self.object_path = ''
-            self.bucket = None
-
-    def append_path(self, report_path: str, dataset_name: str):
-        if dataset_name == 'general_qa':
-            self.CUSTOM_DATASET_PATH.append(report_path)
-        else:
-            self.COMMON_DATASET_PATH.append(report_path)
-
-    def dump_reports(self, output_dir: str):
-        result = {'CommonDataset': [], 'CustomDataset': []}
-        for line in self.COMMON_DATASET_PATH:
-            with open(line, 'r') as f:
-                report = json.load(f)
-                result['CommonDataset'].append(report)
-        for line in self.CUSTOM_DATASET_PATH:
-            with open(line, 'r') as f:
-                report = json.load(f)
-                report.update({'name': os.path.basename(line)})
-                result['CustomDataset'].append(report)
-
-        os.makedirs(output_dir, exist_ok=True)
-        output_file_name = 'metric.json'
-        output_path = os.path.join(output_dir, output_file_name)
-        with open(output_path, 'w+') as f:
-            f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
-        if self.bucket:
-            remote_path = os.path.join(self.object_path, output_file_name)
-            logger.info(f'** Upload report to oss: {remote_path}')
-            self.bucket.put_object_from_file(remote_path, output_path)
-
-
-if __name__ == '__main__':
-    report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
-    report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
-
-    report_table = gen_table([report_dir_1, report_dir_2])
-    print(report_table)
-
-    # ALL VALUES ONLY FOR EXAMPLE
-    # +--------------------------+-------------------+-------------+
-    # | Model                    | CompetitionMath   | GSM8K       |
-    # +==========================+===================+=============+
-    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-    # +--------------------------+-------------------+-------------+
-    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-    # +--------------------------+-------------------+-------------+
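From get_report above, each per-dataset report JSON had to carry at least four fields; a minimal illustrative example of that shape (values invented):

# Minimal shape of one report JSON consumed by get_report(); values invented.
report = {
    'model_name': 'ZhipuAI_chatglm2-6b',  # becomes the table row label
    'dataset_name': 'gsm8k',              # becomes a column header
    'metric': 'acc',                      # used when 'score' is a bare float
    'score': 30.5,                        # float, or a dict of {metric: value}
}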
evalscope/tools/gen_mmlu_subject_mapping.py (deleted)
@@ -1,90 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-# Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
-
-subcategories = {
-    'abstract_algebra': ['math'],
-    'anatomy': ['health'],
-    'astronomy': ['physics'],
-    'business_ethics': ['business'],
-    'clinical_knowledge': ['health'],
-    'college_biology': ['biology'],
-    'college_chemistry': ['chemistry'],
-    'college_computer_science': ['computer science'],
-    'college_mathematics': ['math'],
-    'college_medicine': ['health'],
-    'college_physics': ['physics'],
-    'computer_security': ['computer science'],
-    'conceptual_physics': ['physics'],
-    'econometrics': ['economics'],
-    'electrical_engineering': ['engineering'],
-    'elementary_mathematics': ['math'],
-    'formal_logic': ['philosophy'],
-    'global_facts': ['other'],
-    'high_school_biology': ['biology'],
-    'high_school_chemistry': ['chemistry'],
-    'high_school_computer_science': ['computer science'],
-    'high_school_european_history': ['history'],
-    'high_school_geography': ['geography'],
-    'high_school_government_and_politics': ['politics'],
-    'high_school_macroeconomics': ['economics'],
-    'high_school_mathematics': ['math'],
-    'high_school_microeconomics': ['economics'],
-    'high_school_physics': ['physics'],
-    'high_school_psychology': ['psychology'],
-    'high_school_statistics': ['math'],
-    'high_school_us_history': ['history'],
-    'high_school_world_history': ['history'],
-    'human_aging': ['health'],
-    'human_sexuality': ['culture'],
-    'international_law': ['law'],
-    'jurisprudence': ['law'],
-    'logical_fallacies': ['philosophy'],
-    'machine_learning': ['computer science'],
-    'management': ['business'],
-    'marketing': ['business'],
-    'medical_genetics': ['health'],
-    'miscellaneous': ['other'],
-    'moral_disputes': ['philosophy'],
-    'moral_scenarios': ['philosophy'],
-    'nutrition': ['health'],
-    'philosophy': ['philosophy'],
-    'prehistory': ['history'],
-    'professional_accounting': ['other'],
-    'professional_law': ['law'],
-    'professional_medicine': ['health'],
-    'professional_psychology': ['psychology'],
-    'public_relations': ['politics'],
-    'security_studies': ['politics'],
-    'sociology': ['culture'],
-    'us_foreign_policy': ['politics'],
-    'virology': ['health'],
-    'world_religions': ['philosophy'],
-}
-
-categories = {
-    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
-    'Humanities': ['history', 'philosophy', 'law'],
-    'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
-    'Other': ['other', 'business', 'health'],
-}
-
-
-def main():
-
-    reversed_categories = {}
-    for category, subcategory_list in categories.items():
-        for subcategory in subcategory_list:
-            reversed_categories[subcategory] = category
-
-    subject_mapping = {}
-    for subject, subcategory_list in subcategories.items():
-        category_name: str = reversed_categories[subcategory_list[0]]
-        subject_show_name: str = ' '.join([item.capitalize() for item in subject.split('_')])
-        subject_mapping[subject] = [subject_show_name, subcategory_list[0], category_name]
-
-    print(subject_mapping)
-
-
-if __name__ == '__main__':
-    main()
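For reference, the start of the mapping that main() prints, derived mechanically from the two dicts above (each subject maps to [display name, subcategory, category]):

# First entries of the dict that main() prints, derived from the tables above:
subject_mapping_sample = {
    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
    'anatomy': ['Anatomy', 'health', 'Other'],
    'astronomy': ['Astronomy', 'physics', 'STEM'],
    # ... one entry per subject, 57 in total
}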