evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
--- a/evalscope/tools/combine_reports.py
+++ /dev/null
@@ -1,135 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import glob
- import json
- import os
- from collections import defaultdict
- from tabulate import tabulate
-
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
- """
- Combine and generate table for reports of LLMs.
- """
-
-
- def get_report(report_file: str):
-     data_d: dict = json.load(open(report_file, 'r'))
-     dataset_name = data_d['dataset_name']
-     model_name = data_d['model_name']
-     score = data_d['score']  # float or dict
-     score_d = {}
-     if isinstance(score, dict):
-         # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
-         score_d = score
-     elif isinstance(score, float):
-         # score_d['acc'] = round(score, 4) * 100
-         score_d['acc'] = score
-     else:
-         raise ValueError(f'Unknown score type: {type(score)}')
-     # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
-     score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
-
-     return model_name, {'dataset_name': dataset_name, 'score': score_str}
-
-
- def get_model_reports(model_report_dir: str):
-     model_report_dir = os.path.normpath(model_report_dir)
-     report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
-
-     model_reports_d = defaultdict(list)
-     for file_path in report_files:
-         model_name, report_d = get_report(file_path)
-         model_reports_d[model_name].append(report_d)
-
-     return model_reports_d
-
-
- def gen_table(reports_path_list: list):
-     table_values = []
-     headers = ['Model']
-     is_headers_set = False
-
-     for report_path in reports_path_list:
-         model_reports_d = get_model_reports(report_path)
-         for model_name, report_list in model_reports_d.items():
-             report_list = sorted(report_list, key=lambda x: x['dataset_name'])
-             if not is_headers_set:
-                 headers.extend([x['dataset_name'] for x in report_list])
-                 is_headers_set = True
-             single_row = []
-             single_row.append(model_name)
-             for single_report in report_list:
-                 # e.g. '28.51 (acc)'
-                 single_row.append(single_report['score'])
-             table_values.append(single_row)
-
-     report_table = tabulate(table_values, headers=headers, tablefmt='grid')
-     return report_table
-
-
- class ReportsRecorder:
-     COMMON_DATASET_PATH = []
-     CUSTOM_DATASET_PATH = []
-
-     def __init__(self, oss_url: str = '', endpoint: str = ''):
-         if oss_url and endpoint:
-             import oss2
-             from oss2.credentials import EnvironmentVariableCredentialsProvider
-
-             auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
-             oss_url = oss_url.replace('oss://', '').split('/')
-             bucket_name = oss_url[0]
-
-             self.object_path = '/'.join(oss_url[1:])
-             self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
-         else:
-             self.object_path = ''
-             self.bucket = None
-
-     def append_path(self, report_path: str, dataset_name: str):
-         if dataset_name == 'general_qa':
-             self.CUSTOM_DATASET_PATH.append(report_path)
-         else:
-             self.COMMON_DATASET_PATH.append(report_path)
-
-     def dump_reports(self, output_dir: str):
-         result = {'CommonDataset': [], 'CustomDataset': []}
-         for line in self.COMMON_DATASET_PATH:
-             with open(line, 'r') as f:
-                 report = json.load(f)
-             result['CommonDataset'].append(report)
-         for line in self.CUSTOM_DATASET_PATH:
-             with open(line, 'r') as f:
-                 report = json.load(f)
-             report.update({'name': os.path.basename(line)})
-             result['CustomDataset'].append(report)
-
-         os.makedirs(output_dir, exist_ok=True)
-         output_file_name = 'metric.json'
-         output_path = os.path.join(output_dir, output_file_name)
-         with open(output_path, 'w+') as f:
-             f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
-         if self.bucket:
-             remote_path = os.path.join(self.object_path, output_file_name)
-             logger.info(f'** Upload report to oss: {remote_path}')
-             self.bucket.put_object_from_file(remote_path, output_path)
-
-
- if __name__ == '__main__':
-     report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
-     report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
-
-     report_table = gen_table([report_dir_1, report_dir_2])
-     print(report_table)
-
-     # ALL VALUES ONLY FOR EXAMPLE
-     # +--------------------------+-------------------+-------------+
-     # | Model                    | CompetitionMath   | GSM8K       |
-     # +==========================+===================+=============+
-     # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-     # +--------------------------+-------------------+-------------+
-     # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-     # +--------------------------+-------------------+-------------+
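
For orientation, a minimal sketch of how these removed helpers were driven in 0.8.0 (the import path matches the removed module; all file paths below are illustrative placeholders):

    from evalscope.tools.combine_reports import ReportsRecorder, gen_table

    # Aggregate per-dataset report JSONs into a single metric.json;
    # 'general_qa' reports are routed to the CustomDataset section.
    recorder = ReportsRecorder()  # no oss_url/endpoint, so no OSS upload
    recorder.append_path('/path/to/reports/gsm8k.json', dataset_name='gsm8k')
    recorder.append_path('/path/to/reports/my_qa.json', dataset_name='general_qa')
    recorder.dump_reports('/path/to/output')  # writes /path/to/output/metric.json

    # Render a model-by-dataset grid from one or more report directories.
    print(gen_table(['/path/to/model_a/reports', '/path/to/model_b/reports']))

Judging by the file list above, this role appears to be taken over in 0.10.1 by the new evalscope/report package (combinator.py, generator.py).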
--- a/evalscope/tools/gen_mmlu_subject_mapping.py
+++ /dev/null
@@ -1,90 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- # Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
-
- subcategories = {
-     'abstract_algebra': ['math'],
-     'anatomy': ['health'],
-     'astronomy': ['physics'],
-     'business_ethics': ['business'],
-     'clinical_knowledge': ['health'],
-     'college_biology': ['biology'],
-     'college_chemistry': ['chemistry'],
-     'college_computer_science': ['computer science'],
-     'college_mathematics': ['math'],
-     'college_medicine': ['health'],
-     'college_physics': ['physics'],
-     'computer_security': ['computer science'],
-     'conceptual_physics': ['physics'],
-     'econometrics': ['economics'],
-     'electrical_engineering': ['engineering'],
-     'elementary_mathematics': ['math'],
-     'formal_logic': ['philosophy'],
-     'global_facts': ['other'],
-     'high_school_biology': ['biology'],
-     'high_school_chemistry': ['chemistry'],
-     'high_school_computer_science': ['computer science'],
-     'high_school_european_history': ['history'],
-     'high_school_geography': ['geography'],
-     'high_school_government_and_politics': ['politics'],
-     'high_school_macroeconomics': ['economics'],
-     'high_school_mathematics': ['math'],
-     'high_school_microeconomics': ['economics'],
-     'high_school_physics': ['physics'],
-     'high_school_psychology': ['psychology'],
-     'high_school_statistics': ['math'],
-     'high_school_us_history': ['history'],
-     'high_school_world_history': ['history'],
-     'human_aging': ['health'],
-     'human_sexuality': ['culture'],
-     'international_law': ['law'],
-     'jurisprudence': ['law'],
-     'logical_fallacies': ['philosophy'],
-     'machine_learning': ['computer science'],
-     'management': ['business'],
-     'marketing': ['business'],
-     'medical_genetics': ['health'],
-     'miscellaneous': ['other'],
-     'moral_disputes': ['philosophy'],
-     'moral_scenarios': ['philosophy'],
-     'nutrition': ['health'],
-     'philosophy': ['philosophy'],
-     'prehistory': ['history'],
-     'professional_accounting': ['other'],
-     'professional_law': ['law'],
-     'professional_medicine': ['health'],
-     'professional_psychology': ['psychology'],
-     'public_relations': ['politics'],
-     'security_studies': ['politics'],
-     'sociology': ['culture'],
-     'us_foreign_policy': ['politics'],
-     'virology': ['health'],
-     'world_religions': ['philosophy'],
- }
-
- categories = {
-     'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
-     'Humanities': ['history', 'philosophy', 'law'],
-     'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
-     'Other': ['other', 'business', 'health'],
- }
-
-
- def main():
-
-     reversed_categories = {}
-     for category, subcategory_list in categories.items():
-         for subcategory in subcategory_list:
-             reversed_categories[subcategory] = category
-
-     subject_mapping = {}
-     for subject, subcategory_list in subcategories.items():
-         category_name: str = reversed_categories[subcategory_list[0]]
-         subject_show_name: str = ' '.join([item.capitalize() for item in subject.split('_')])
-         subject_mapping[subject] = [subject_show_name, subcategory_list[0], category_name]
-
-     print(subject_mapping)
-
-
- if __name__ == '__main__':
-     main()
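
For reference, the mapping that main() prints begins as follows, worked out by hand from the two dictionaries above (each value is [display name, subcategory, category]):

    {'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
     'anatomy': ['Anatomy', 'health', 'Other'],
     'astronomy': ['Astronomy', 'physics', 'STEM'],
     ...}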