evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/tools/combine_reports.py (deleted)
@@ -1,135 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import glob
-import json
-import os
-from collections import defaultdict
-from tabulate import tabulate
-
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-"""
-Combine and generate table for reports of LLMs.
-"""
-
-
-def get_report(report_file: str):
-    data_d: dict = json.load(open(report_file, 'r'))
-    dataset_name = data_d['dataset_name']
-    model_name = data_d['model_name']
-    score = data_d['score']  # float or dict
-    score_d = {}
-    if isinstance(score, dict):
-        # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
-        score_d = score
-    elif isinstance(score, float):
-        # score_d['acc'] = round(score, 4) * 100
-        score_d['acc'] = score
-    else:
-        raise ValueError(f'Unknown score type: {type(score)}')
-    # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
-    score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
-
-    return model_name, {'dataset_name': dataset_name, 'score': score_str}
-
-
-def get_model_reports(model_report_dir: str):
-    model_report_dir = os.path.normpath(model_report_dir)
-    report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
-
-    model_reports_d = defaultdict(list)
-    for file_path in report_files:
-        model_name, report_d = get_report(file_path)
-        model_reports_d[model_name].append(report_d)
-
-    return model_reports_d
-
-
-def gen_table(reports_path_list: list):
-    table_values = []
-    headers = ['Model']
-    is_headers_set = False
-
-    for report_path in reports_path_list:
-        model_reports_d = get_model_reports(report_path)
-        for model_name, report_list in model_reports_d.items():
-            report_list = sorted(report_list, key=lambda x: x['dataset_name'])
-            if not is_headers_set:
-                headers.extend([x['dataset_name'] for x in report_list])
-                is_headers_set = True
-            single_row = []
-            single_row.append(model_name)
-            for single_report in report_list:
-                # e.g. '28.51 (acc)'
-                single_row.append(single_report['score'])
-            table_values.append(single_row)
-
-    report_table = tabulate(table_values, headers=headers, tablefmt='grid')
-    return report_table
-
-
-class ReportsRecorder:
-    COMMON_DATASET_PATH = []
-    CUSTOM_DATASET_PATH = []
-
-    def __init__(self, oss_url: str = '', endpoint: str = ''):
-        if oss_url and endpoint:
-            import oss2
-            from oss2.credentials import EnvironmentVariableCredentialsProvider
-
-            auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
-            oss_url = oss_url.replace('oss://', '').split('/')
-            bucket_name = oss_url[0]
-
-            self.object_path = '/'.join(oss_url[1:])
-            self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
-        else:
-            self.object_path = ''
-            self.bucket = None
-
-    def append_path(self, report_path: str, dataset_name: str):
-        if dataset_name == 'general_qa':
-            self.CUSTOM_DATASET_PATH.append(report_path)
-        else:
-            self.COMMON_DATASET_PATH.append(report_path)
-
-    def dump_reports(self, output_dir: str):
-        result = {'CommonDataset': [], 'CustomDataset': []}
-        for line in self.COMMON_DATASET_PATH:
-            with open(line, 'r') as f:
-                report = json.load(f)
-            result['CommonDataset'].append(report)
-        for line in self.CUSTOM_DATASET_PATH:
-            with open(line, 'r') as f:
-                report = json.load(f)
-            report.update({'name': os.path.basename(line)})
-            result['CustomDataset'].append(report)
-
-        os.makedirs(output_dir, exist_ok=True)
-        output_file_name = 'metric.json'
-        output_path = os.path.join(output_dir, output_file_name)
-        with open(output_path, 'w+') as f:
-            f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
-        if self.bucket:
-            remote_path = os.path.join(self.object_path, output_file_name)
-            logger.info(f'** Upload report to oss: {remote_path}')
-            self.bucket.put_object_from_file(remote_path, output_path)
-
-
-if __name__ == '__main__':
-    report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
-    report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
-
-    report_table = gen_table([report_dir_1, report_dir_2])
-    print(report_table)
-
-    # ALL VALUES ONLY FOR EXAMPLE
-    # +--------------------------+-------------------+-------------+
-    # | Model                    | CompetitionMath   | GSM8K       |
-    # +==========================+===================+=============+
-    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-    # +--------------------------+-------------------+-------------+
-    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-    # +--------------------------+-------------------+-------------+
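For reference, the report JSON shape that the deleted get_report() consumed can be read off its key accesses: dataset_name, model_name, and a score that is either a float or a dict of metric name to value. Below is a minimal sketch that writes one such report where get_model_reports() would find it; the dataset, model name, and score values are hypothetical.

import json
import os
import tempfile

# Shape read by the deleted get_report(); 'score' may be a float or a
# dict of metric name -> value (see its isinstance() branches above).
report = {
    'dataset_name': 'gsm8k',
    'model_name': 'my-model',  # hypothetical
    'score': {'acc': 30.5},    # illustrative value
}

# get_model_reports() globbed '<reports_dir>/**/*.json' without
# recursive=True, i.e. it matched files exactly one directory level
# below the reports directory.
reports_dir = os.path.join(tempfile.mkdtemp(), 'reports')
dataset_dir = os.path.join(reports_dir, 'gsm8k')
os.makedirs(dataset_dir, exist_ok=True)
with open(os.path.join(dataset_dir, 'gsm8k.json'), 'w') as f:
    json.dump(report, f)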
evalscope/tools/gen_mmlu_subject_mapping.py (deleted)
@@ -1,90 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-# Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
-
-subcategories = {
-    'abstract_algebra': ['math'],
-    'anatomy': ['health'],
-    'astronomy': ['physics'],
-    'business_ethics': ['business'],
-    'clinical_knowledge': ['health'],
-    'college_biology': ['biology'],
-    'college_chemistry': ['chemistry'],
-    'college_computer_science': ['computer science'],
-    'college_mathematics': ['math'],
-    'college_medicine': ['health'],
-    'college_physics': ['physics'],
-    'computer_security': ['computer science'],
-    'conceptual_physics': ['physics'],
-    'econometrics': ['economics'],
-    'electrical_engineering': ['engineering'],
-    'elementary_mathematics': ['math'],
-    'formal_logic': ['philosophy'],
-    'global_facts': ['other'],
-    'high_school_biology': ['biology'],
-    'high_school_chemistry': ['chemistry'],
-    'high_school_computer_science': ['computer science'],
-    'high_school_european_history': ['history'],
-    'high_school_geography': ['geography'],
-    'high_school_government_and_politics': ['politics'],
-    'high_school_macroeconomics': ['economics'],
-    'high_school_mathematics': ['math'],
-    'high_school_microeconomics': ['economics'],
-    'high_school_physics': ['physics'],
-    'high_school_psychology': ['psychology'],
-    'high_school_statistics': ['math'],
-    'high_school_us_history': ['history'],
-    'high_school_world_history': ['history'],
-    'human_aging': ['health'],
-    'human_sexuality': ['culture'],
-    'international_law': ['law'],
-    'jurisprudence': ['law'],
-    'logical_fallacies': ['philosophy'],
-    'machine_learning': ['computer science'],
-    'management': ['business'],
-    'marketing': ['business'],
-    'medical_genetics': ['health'],
-    'miscellaneous': ['other'],
-    'moral_disputes': ['philosophy'],
-    'moral_scenarios': ['philosophy'],
-    'nutrition': ['health'],
-    'philosophy': ['philosophy'],
-    'prehistory': ['history'],
-    'professional_accounting': ['other'],
-    'professional_law': ['law'],
-    'professional_medicine': ['health'],
-    'professional_psychology': ['psychology'],
-    'public_relations': ['politics'],
-    'security_studies': ['politics'],
-    'sociology': ['culture'],
-    'us_foreign_policy': ['politics'],
-    'virology': ['health'],
-    'world_religions': ['philosophy'],
-}
-
-categories = {
-    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
-    'Humanities': ['history', 'philosophy', 'law'],
-    'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
-    'Other': ['other', 'business', 'health'],
-}
-
-
-def main():
-
-    reversed_categories = {}
-    for category, subcategory_list in categories.items():
-        for subcategory in subcategory_list:
-            reversed_categories[subcategory] = category
-
-    subject_mapping = {}
-    for subject, subcategory_list in subcategories.items():
-        category_name: str = reversed_categories[subcategory_list[0]]
-        subject_show_name: str = ' '.join([item.capitalize() for item in subject.split('_')])
-        subject_mapping[subject] = [subject_show_name, subcategory_list[0], category_name]
-
-    print(subject_mapping)
-
-
-if __name__ == '__main__':
-    main()
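Run as a script, the deleted main() printed a subject_mapping of the form {subject: [display name, subcategory, category]}. Two sample entries, derived mechanically from the tables above:

# e.g. entries of the printed subject_mapping:
#   'abstract_algebra': ['Abstract Algebra', 'math', 'STEM']
#   'professional_law': ['Professional Law', 'law', 'Humanities']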