evalscope 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/utils/utils.py
ADDED
@@ -0,0 +1,625 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) OpenCompass.

import functools
import importlib
import importlib.util
import os
import re
import json
import random
import sys
from typing import Any, Union, Dict, Tuple, List
import hashlib
import torch.nn.functional as F

import jsonlines as jsonl
import yaml

from evalscope.constants import DumpMode, OutputsStructure
from evalscope.utils.logger import get_logger

logger = get_logger()

TEST_LEVEL_LIST = [0, 1]

# Example: export TEST_LEVEL_LIST=0,1
TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'


def test_level_list():
    global TEST_LEVEL_LIST
    if TEST_LEVEL_LIST_STR in os.environ:
        TEST_LEVEL_LIST = [
            int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')
        ]

    return TEST_LEVEL_LIST


def jsonl_to_list(jsonl_file):
    """
    Read jsonl file to list.

    Args:
        jsonl_file: jsonl file path.

    Returns:
        list: list of lines. Each line is a dict.
    """
    res_list = []
    with jsonl.open(jsonl_file, mode='r') as reader:
        for line in reader.iter(
                type=dict, allow_none=True, skip_invalid=False):
            res_list.append(line)
    return res_list


def jsonl_to_reader(jsonl_file):
    """
    Open a jsonl file and return the reader object.

    Args:
        jsonl_file: jsonl file path.

    Returns:
        reader: jsonl reader object. The caller is responsible for closing it.
    """
    # Return the reader without a `with` block: exiting a `with` would close
    # the underlying file before the caller could iterate the reader.
    return jsonl.open(jsonl_file, mode='r')


def jsonl_to_csv():
    pass


def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
    """
    Dump data to jsonl file.

    Args:
        data_list: data list to be dumped. [{'a': 'aaa'}, ...]
        jsonl_file: jsonl file path.
        dump_mode: dump mode. It can be 'overwrite' or 'append'.
    """
    if not jsonl_file:
        raise ValueError('output file must be provided.')

    jsonl_file = os.path.expanduser(jsonl_file)

    if dump_mode == DumpMode.OVERWRITE:
        dump_mode = 'w'
    elif dump_mode == DumpMode.APPEND:
        dump_mode = 'a'
    with jsonl.open(jsonl_file, mode=dump_mode) as writer:
        writer.write_all(data_list)
    logger.info(f'Dump data to {jsonl_file} successfully.')
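A minimal round-trip sketch for the JSONL helpers above; the path and records are illustrative, not part of the package source:

>>> from evalscope.constants import DumpMode
>>> from evalscope.utils.utils import dump_jsonl_data, jsonl_to_list
>>> records = [{'query': 'hello'}, {'query': 'world'}]
>>> dump_jsonl_data(records, '/tmp/demo.jsonl')
>>> dump_jsonl_data([{'query': '!'}], '/tmp/demo.jsonl', dump_mode=DumpMode.APPEND)
>>> jsonl_to_list('/tmp/demo.jsonl')
[{'query': 'hello'}, {'query': 'world'}, {'query': '!'}]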


def yaml_to_dict(yaml_file) -> dict:
    """
    Read yaml file to dict.
    """
    with open(yaml_file, 'r') as f:
        try:
            stream = yaml.safe_load(f)
        except yaml.YAMLError as e:
            logger.error(f'{e}')
            raise e

    return stream


def dict_to_yaml(d: dict, yaml_file: str):
    """
    Dump dict to yaml file.
    """
    with open(yaml_file, 'w') as f:
        yaml.dump(d, f, default_flow_style=False)
    logger.info(f'Dump data to {yaml_file} successfully.')


def json_to_dict(json_file) -> dict:
    """
    Read json file to dict.
    """
    with open(json_file, 'r') as f:
        try:
            stream = json.load(f)
        except json.JSONDecodeError as e:
            logger.error(f'{e}')
            raise e

    return stream


def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
    module_name, splitter, cls_name = eval_class_ref.partition(':')

    try:
        obj_cls = importlib.import_module(module_name)
    except ImportError as e:
        logger.error(f'{e}')
        raise e

    if splitter:
        for attr in cls_name.split('.'):
            obj_cls = getattr(obj_cls, attr)

    return functools.partial(obj_cls, *args, **kwargs)
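A small sketch of how a 'module:attr' reference resolves through get_obj_from_cfg; the os.path:join target and the POSIX-style result are illustrative:

>>> from evalscope.utils.utils import get_obj_from_cfg
>>> join_tmp = get_obj_from_cfg('os.path:join', '/tmp')  # partial(os.path.join, '/tmp')
>>> join_tmp('demo.jsonl')
'/tmp/demo.jsonl'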


def markdown_table(header_l, data_l):
    md_str = f'| {" | ".join(header_l)} |'
    md_str += f'\n| {" | ".join(["---"] * len(header_l))} |'
    for data in data_l:
        if isinstance(data, str):
            data = [data]
        assert len(data) <= len(header_l)
        tmp = data + [''] * (len(header_l) - len(data))
        md_str += f'\n| {" | ".join(tmp)} |'
    return md_str
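For reference, a short call that exercises markdown_table; the header and rows are made up. Short rows and bare strings are padded to the header width:

>>> from evalscope.utils.utils import markdown_table
>>> print(markdown_table(['model', 'acc'], [['qwen-7b-chat', '0.5123'], 'gpt-4']))
| model | acc |
| --- | --- |
| qwen-7b-chat | 0.5123 |
| gpt-4 |  |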


def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
    """Random choice with a (potentially string) seed."""
    return random.Random(seed).choices(choices, k=1, **kwargs)[0]


def gen_hash(name: str):
    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()


def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
    """
    Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
    converts torch.dtype to a string of just the type. For example, `torch.float32` gets converted into *"float32"*
    string, which can then be stored in the json format.

    Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.
    """
    if d.get('torch_dtype', None) is not None and not isinstance(d['torch_dtype'], str):
        d['torch_dtype'] = str(d['torch_dtype']).split('.')[1]

    for value in d.values():
        if isinstance(value, dict):
            dict_torch_dtype_to_str(value)

    return d
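A quick sketch of the dtype serialization above, assuming torch is installed; the config dict and its keys are illustrative:

>>> import torch
>>> from evalscope.utils.utils import dict_torch_dtype_to_str
>>> dict_torch_dtype_to_str({'torch_dtype': torch.float16, 'gen': {'torch_dtype': torch.bfloat16}})
{'torch_dtype': 'float16', 'gen': {'torch_dtype': 'bfloat16'}}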


class ResponseParser:

    @staticmethod
    def parse_first_capital(text: str) -> str:
        for t in text:
            if t.isupper():
                return t
        return ''

    @staticmethod
    def parse_last_capital(text: str) -> str:
        for t in text[::-1]:
            if t.isupper():
                return t
        return ''

    @staticmethod
    def parse_first_option_with_choices(text: str, options: list) -> str:
        """
        Find first valid option for text.

        Args:
            text: The text to parse.
            options: The options to find. e.g. ['A', 'B', 'C', 'D']
        """
        options_concat = '|'.join([str(i) for i in options])

        patterns = [
            f'答案是?\s?([{options_concat}])',
            f'答案是?\s?:([{options_concat}])',
            f'答案是?\s?:([{options_concat}])',
            f'答案应该?是\s?([{options_concat}])',
            f'答案应该?选\s?([{options_concat}])',
            f'答案为\s?([{options_concat}])',
            f'答案选\s?([{options_concat}])',
            f'选择?\s?([{options_concat}])',
            f'故选?\s?([{options_concat}])',
            f'只有选?项?\s?([{options_concat}])\s?是?对',
            f'只有选?项?\s?([{options_concat}])\s?是?错',
            f'只有选?项?\s?([{options_concat}])\s?不?正确',
            f'只有选?项?\s?([{options_concat}])\s?错误',
            f'说法不?对选?项?的?是\s?([{options_concat}])',
            f'说法不?正确选?项?的?是\s?([{options_concat}])',
            f'说法错误选?项?的?是\s?([{options_concat}])',
            f'([{options_concat}])\s?是正确的',
            f'([{options_concat}])\s?是正确答案',
            f'选项\s?([{options_concat}])\s?正确',
            f'所以答\s?([{options_concat}])',
            f'1.\s?([{options_concat}])[.。$]?$',
            f'所以\s?([{options_concat}][.。$]?$)',
            f'所有\s?([{options_concat}][.。$]?$)',
            f'[\s,::,]([{options_concat}])[。,,\.]?$',
            f'[\s,,::][故即]([{options_concat}])[。\.]?$',
            f'[\s,,::]因此([{options_concat}])[。\.]?$',
            f'[是为。]\s?([{options_concat}])[。\.]?$',
            f'因此\s?([{options_concat}])[。\.]?$',
            f'显然\s?([{options_concat}])[。\.]?$',
            f'答案是\s?(\S+)(?:。|$)',
            f'答案应该是\s?(\S+)(?:。|$)',
            f'答案为\s?(\S+)(?:。|$)',
            f'答案是(.*?)[{options_concat}]',
            f'答案为(.*?)[{options_concat}]',
            f'固选(.*?)[{options_concat}]',
            f'答案应该是(.*?)[{options_concat}]',
            f'[Tt]he answer is [{options_concat}]',
            f'[Tt]he correct answer is [{options_concat}]',
            f'[Tt]he correct answer is:\n[{options_concat}]',
            f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
            f'[{options_concat}]',
            f'^选项\s?([{options_concat}])',
            f'^([{options_concat}])\s?选?项',
            f'(\s|^)[{options_concat}][\s。,,::\.$]',
            f'(\s|^)[{options_concat}](\s|$)',
            f'1.\s?(.*?)$',
        ]

        regexes = [re.compile(pattern) for pattern in patterns]
        for regex in regexes:
            match = regex.search(text)
            if match:
                outputs = match.group(0)
                for i in options:
                    if i in outputs:
                        return i
        return ''

    @staticmethod
    def parse_first_option(text: str) -> str:
        """
        Find first valid option for text.

        Args:
            text: The text to parse.
        """
        patterns = [
            r"[Aa]nswer:\s*(\w+)",
            r"[Tt]he correct answer is:\s*(\w+)",
            r"[Tt]he correct answer is:\n\s*(\w+)",
            r"[Tt]he correct answer is:\n\n-\s*(\w+)",
            r"[Tt]he answer might be:\n\n-\s*(\w+)",
            r"[Tt]he answer is \s*(\w+)",
        ]

        regexes = [re.compile(pattern) for pattern in patterns]
        for regex in regexes:
            match = regex.search(text)
            if match:
                return match.group(1)
        return ''

    @staticmethod
    def parse_first_capital_multi(text: str) -> str:
        match = re.search(r'([A-D]+)', text)
        if match:
            return match.group(1)
        return ''

    @staticmethod
    def parse_last_option(text: str, options: str) -> str:
        match = re.findall(rf'([{options}])', text)
        if match:
            return match[-1]
        return ''
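A brief sketch of the parsers above on synthetic model responses; the texts are made up:

>>> from evalscope.utils.utils import ResponseParser
>>> ResponseParser.parse_first_option_with_choices('Let me think. The answer is B.', ['A', 'B', 'C', 'D'])
'B'
>>> ResponseParser.parse_first_option('Answer: C')
'C'
>>> ResponseParser.parse_last_option('A is wrong, so the answer is B', options='ABCD')
'B'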


def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
    # model_revision = model_revision if model_revision is not None else 'none'
    # now = datetime.datetime.now()
    # format_time = now.strftime('%Y%m%d_%H%M%S')
    # outputs_name = format_time + '_' + 'default' + '_' + model_id.replace('/', '_') + '_' + model_revision
    # outputs_dir = os.path.join(work_dir, outputs_name)
    # dataset_name = dataset_id.replace('/', '_')
    # outputs_dir = os.path.join(work_dir, dataset_name)

    if not model_id:
        model_id = 'default'
    model_id = model_id.replace('/', '_')

    if not model_revision:
        model_revision = 'default'

    outputs_dir = os.path.join(root_dir,
                               f"eval_{'-'.join(datasets)}_{model_id}_{model_revision}")

    return outputs_dir


def process_outputs_structure(outputs_dir: str, is_make: bool = True) -> dict:
    logs_dir = os.path.join(outputs_dir, 'logs')
    predictions_dir = os.path.join(outputs_dir, 'predictions')
    reviews_dir = os.path.join(outputs_dir, 'reviews')
    reports_dir = os.path.join(outputs_dir, 'reports')
    configs_dir = os.path.join(outputs_dir, 'configs')

    if is_make:
        os.makedirs(outputs_dir, exist_ok=True)
        os.makedirs(logs_dir, exist_ok=True)
        os.makedirs(predictions_dir, exist_ok=True)
        os.makedirs(reviews_dir, exist_ok=True)
        os.makedirs(reports_dir, exist_ok=True)
        os.makedirs(configs_dir, exist_ok=True)

    outputs_structure = {
        OutputsStructure.LOGS_DIR: logs_dir,
        OutputsStructure.PREDICTIONS_DIR: predictions_dir,
        OutputsStructure.REVIEWS_DIR: reviews_dir,
        OutputsStructure.REPORTS_DIR: reports_dir,
        OutputsStructure.CONFIGS_DIR: configs_dir,
    }

    return outputs_structure
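A short sketch tying the two directory helpers together; the root path, dataset names, and model id are illustrative:

>>> from evalscope.utils.utils import make_outputs_dir, process_outputs_structure
>>> outputs_dir = make_outputs_dir('/tmp/outputs', ['mmlu', 'gsm8k'], 'qwen/Qwen-7B-Chat', 'v1.0.0')
>>> outputs_dir
'/tmp/outputs/eval_mmlu-gsm8k_qwen_Qwen-7B-Chat_v1.0.0'
>>> structure = process_outputs_structure(outputs_dir)  # creates logs/, predictions/, reviews/, reports/, configs/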


def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
    """
    Import module utility function.

    Args:
        import_path_prefix: e.g. 'evalscope.benchmarks.'
        module_name: The module name to import. e.g. 'mmlu'
        members_to_import: The members to import.
            e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']

    Returns:
        dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
    """
    imported_modules = {}
    module = importlib.import_module(import_path_prefix + module_name)
    for member_name in members_to_import:
        imported_modules[member_name] = getattr(module, member_name)

    return imported_modules


def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
    """
    Normalize score.

    Args:
        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
        keep_num: number of digits to keep.

    Returns:
        Union[float, dict]: normalized score. e.g. 0.1235 or {'acc': 0.1235, 'f1': 0.1235}
    """
    if isinstance(score, float):
        score = round(score, keep_num)
    elif isinstance(score, dict):
        score = {k: round(v, keep_num) for k, v in score.items()}
    else:
        logger.warning(f'Unknown score type: {type(score)}')

    return score
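For illustration:

>>> from evalscope.utils.utils import normalize_score
>>> normalize_score(0.98765432)
0.9877
>>> normalize_score({'acc': 0.3333333, 'f1': 0.6666666}, keep_num=2)
{'acc': 0.33, 'f1': 0.67}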


def split_str_parts_by(text: str, delimiters: List[str]):
    """Split the text field into parts.

    Args:
        text: A text to be split.
        delimiters: The delimiters.

    Returns:
        The split text in list of dicts.
    """
    all_start_chars = [d[0] for d in delimiters]
    all_length = [len(d) for d in delimiters]

    text_list = []
    last_words = ''

    while len(text) > 0:
        for char_idx, char in enumerate(text):
            match_index = [
                idx for idx, start_char in enumerate(all_start_chars)
                if start_char == char
            ]
            is_delimiter = False
            for index in match_index:
                if text[char_idx:char_idx
                        + all_length[index]] == delimiters[index]:
                    if last_words:
                        if text_list:
                            text_list[-1]['content'] = last_words
                        else:
                            text_list.append({
                                'key': '',
                                'content': last_words
                            })
                    last_words = ''
                    text_list.append({'key': delimiters[index]})
                    text = text[char_idx + all_length[index]:]
                    is_delimiter = True
                    break
            if not is_delimiter:
                last_words += char
            else:
                break
        if last_words == text:
            text = ''

    if text_list:
        text_list[-1]['content'] = last_words
    else:
        # text contained no delimiters at all; keep it as a single keyless part.
        text_list.append({'key': '', 'content': last_words})
    return text_list


def calculate_loss_scale(response: str,
                         use_loss_scale=False
                         ) -> Tuple[List[str], List[float]]:
    """Calculate the loss scale by splitting the agent response.
    This algorithm comes from paper: https://arxiv.org/pdf/2309.00986.pdf
    Agent response format:
    ```text
    Thought: you should always think about what to do
    Action: the action to take, should be one of the above tools[fire_recognition,
        fire_alert, call_police, call_fireman]
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
    Thought: I now know the final answer
    Final Answer: the final answer to the original input question
    ```
    Args:
        response: The response text
        use_loss_scale: Use weighted loss. With this, some part of the loss will be enhanced to improve performance.
    Returns:
        A tuple of agent response parts and their weights.
    """
    if 'Action:' in response and 'Observation:' in response and use_loss_scale:
        agent_keyword = [
            'Action:', 'Action Input:', 'Thought:', 'Final Answer:',
            'Observation:'
        ]
        agent_parts = split_str_parts_by(response, agent_keyword)
        weights = []
        agent_content = []
        for c in agent_parts:
            if c['key'] in ('Action:', 'Action Input:'):
                weights += [2.0]
                weights += [2.0]
            elif c['key'] in ('Thought:', 'Final Answer:', ''):
                weights += [1.0]
                weights += [1.0]
            elif c['key'] in ('Observation:', ):
                weights += [2.0]
                weights += [0.0]
            agent_content.append(c['key'])
            agent_content.append(c['content'])
        return agent_content, weights
    else:
        return [response], [1.0]
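A sketch of the weighting on a toy ReAct-style response; the response text is made up. Keys and contents are interleaved, Action/Action Input parts are weighted up, and Observation content (the tool output) is masked out:

>>> from evalscope.utils.utils import calculate_loss_scale
>>> response = 'Thought: check the weather\nAction: weather_api\nAction Input: {"city": "Beijing"}\nObservation: sunny\nFinal Answer: It is sunny.'
>>> parts, weights = calculate_loss_scale(response, use_loss_scale=True)
>>> weights
[1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 1.0, 1.0]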


def get_bucket_sizes(max_length: int) -> List[int]:
    return [max_length // 4 * (i + 1) for i in range(4)]


def _get_closest_bucket(bucket_sizes, data_length):
    """Select the one from bucket_sizes that is closest in distance to
    data_length. This is required for TorchAcc.
    """
    closest_length = sys.maxsize
    for b in bucket_sizes:
        if b == data_length or ((b < closest_length) and (b > data_length)):
            closest_length = b

    if closest_length == sys.maxsize:
        bucket_sizes.append(data_length)
        closest_length = data_length

    return closest_length
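A quick illustration of the bucketing used for TorchAcc padding; the lengths are illustrative:

>>> from evalscope.utils.utils import get_bucket_sizes, _get_closest_bucket
>>> buckets = get_bucket_sizes(2048)
>>> buckets
[512, 1024, 1536, 2048]
>>> _get_closest_bucket(buckets, 700)   # smallest bucket that fits
1024
>>> _get_closest_bucket(buckets, 3000)  # no bucket fits, so 3000 is appended
3000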


def pad_and_split_batch(padding_to, input_ids, attention_mask, labels,
                        loss_scale, max_length, tokenizer, rank, world_size):
    if padding_to is None:
        longest_len = input_ids.shape[-1]
        bucket_sizes = get_bucket_sizes(max_length)
        bucket_data_length = _get_closest_bucket(bucket_sizes, longest_len)
        padding_length = bucket_data_length - input_ids.shape[1]
        input_ids = F.pad(input_ids, (0, padding_length), 'constant',
                          tokenizer.pad_token_id)
        attention_mask = F.pad(attention_mask, (0, padding_length), 'constant',
                               0)
        if loss_scale:
            loss_scale = F.pad(loss_scale, (0, padding_length), 'constant', 0.)
        labels = F.pad(labels, (0, padding_length), 'constant', -100)

    # manually split the batch to different DP ranks.
    batch_size = input_ids.shape[0] // world_size
    if batch_size > 0:
        start = rank * batch_size
        end = (rank + 1) * batch_size
        input_ids = input_ids[start:end, :]
        attention_mask = attention_mask[start:end, :]
        labels = labels[start:end, :]
        if loss_scale:
            loss_scale = loss_scale[start:end, :]
    return input_ids, attention_mask, labels, loss_scale


def get_dist_setting() -> Tuple[int, int, int, int]:
    """return rank, local_rank, world_size, local_world_size"""
    rank = int(os.getenv('RANK', -1))
    local_rank = int(os.getenv('LOCAL_RANK', -1))
    world_size = int(os.getenv('WORLD_SIZE', 1))
    local_world_size = int(os.getenv('LOCAL_WORLD_SIZE', 1))
    return rank, local_rank, world_size, local_world_size


def use_torchacc() -> bool:
    return os.getenv('USE_TORCHACC', '0') == '1'


def is_module_installed(module_name):
    try:
        importlib.import_module(module_name)
        return True
    except ImportError:
        return False


def get_module_path(module_name):
    spec = importlib.util.find_spec(module_name)
    if spec and spec.origin:
        return os.path.abspath(spec.origin)
    else:
        raise ValueError(f'Cannot find module: {module_name}')


def get_valid_list(input_list, candidate_list):
    """
    Get the valid and invalid list from input_list based on candidate_list.

    Args:
        input_list: The input list.
        candidate_list: The candidate list.

    Returns:
        valid_list: The valid list.
        invalid_list: The invalid list.
    """
    return [i for i in input_list if i in candidate_list], \
        [i for i in input_list if i not in candidate_list]
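For illustration, partitioning requested dataset names against registered ones; the names are made up:

>>> from evalscope.utils.utils import get_valid_list
>>> get_valid_list(['mmlu', 'foo'], ['mmlu', 'gsm8k', 'arc'])
(['mmlu'], ['foo'])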


def get_latest_folder_path(work_dir):
    from datetime import datetime

    # Get all subdirectories in the work_dir
    folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]

    # Timestamp pattern (YYYYMMDD_HHMMSS)
    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')

    # Keep only the timestamped folders
    timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]

    if not timestamped_folders:
        print(f'>> No timestamped folders found in {work_dir}!')
        return None

    # timestamp parser
    def parse_timestamp(folder_name):
        return datetime.strptime(folder_name, "%Y%m%d_%H%M%S")

    # Find the latest folder
    latest_folder = max(timestamped_folders, key=parse_timestamp)

    return os.path.join(work_dir, latest_folder)


def csv_to_list(file_path: str) -> List[dict]:
    import csv

    with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        result = [row for row in csv_reader]

    return result
evalscope/version.py
ADDED