evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,625 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ # Copyright (c) OpenCompass.
3
+
4
+ import functools
5
+ import importlib
6
+ import importlib.util
7
+ import os
8
+ import re
9
+ import json
10
+ import random
11
+ import sys
12
+ from typing import Any, Union, Dict, Tuple, List
13
+ import hashlib
14
+ import torch.nn.functional as F
15
+
16
+ import jsonlines as jsonl
17
+ import yaml
18
+
19
+ from evalscope.constants import DumpMode, OutputsStructure
20
+ from evalscope.utils.logger import get_logger
21
+
22
+ logger = get_logger()
23
+
24
TEST_LEVEL_LIST = [0, 1]

# Example: export TEST_LEVEL_LIST=0,1
TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'


def test_level_list():
    """
    Return the active test levels.

    When the environment variable named by TEST_LEVEL_LIST_STR is set, its
    comma-separated integers replace the module-level TEST_LEVEL_LIST (the
    replacement persists for later calls); otherwise the current value is
    returned untouched.

    Returns:
        list: the list of integer test levels.
    """
    global TEST_LEVEL_LIST
    raw_levels = os.environ.get(TEST_LEVEL_LIST_STR)
    if raw_levels is not None:
        TEST_LEVEL_LIST = list(map(int, raw_levels.split(',')))
    return TEST_LEVEL_LIST
38
+
39
+
40
def jsonl_to_list(jsonl_file):
    """
    Load every record of a jsonl file into a list.

    Args:
        jsonl_file: jsonl file path.

    Returns:
        list: the records in file order. Each record is read as a dict
            (null lines are allowed; invalid lines are not skipped).
    """
    with jsonl.open(jsonl_file, mode='r') as reader:
        records = list(
            reader.iter(type=dict, allow_none=True, skip_invalid=False))
    return records
56
+
57
+
58
def jsonl_to_reader(jsonl_file):
    """
    Open a jsonl file and return the reader object.

    The reader is returned OPEN. The caller owns it and must close it when
    done (e.g. `reader.close()` or by using it as a context manager).

    Args:
        jsonl_file: jsonl file path.

    Returns:
        reader: an open jsonl reader object.
    """
    # Deliberately not wrapped in `with`: leaving a `with` block closes the
    # underlying file, so the caller would receive an unusable reader.
    return jsonl.open(jsonl_file, mode='r')
70
+
71
+
72
def jsonl_to_csv():
    """Placeholder: not implemented yet. Does nothing and returns None."""
74
+
75
+
76
def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
    """
    Write a list of records to a jsonl file.

    Args:
        data_list: records to be dumped. [{'a': 'aaa'}, ...]
        jsonl_file: jsonl file path ('~' is expanded).
        dump_mode: DumpMode.OVERWRITE or DumpMode.APPEND. Any other value is
            handed to the jsonl opener unchanged as the file mode.

    Raises:
        ValueError: if `jsonl_file` is empty.
    """
    if not jsonl_file:
        raise ValueError('output file must be provided.')

    target_path = os.path.expanduser(jsonl_file)

    # Translate the DumpMode constant into a file mode.
    if dump_mode == DumpMode.OVERWRITE:
        file_mode = 'w'
    elif dump_mode == DumpMode.APPEND:
        file_mode = 'a'
    else:
        file_mode = dump_mode

    with jsonl.open(target_path, mode=file_mode) as writer:
        writer.write_all(data_list)
    logger.info(f'Dump data to {target_path} successfully.')
97
+
98
+
99
def yaml_to_dict(yaml_file) -> dict:
    """
    Parse a yaml file with the safe loader and return its content.

    YAML parse errors are logged and re-raised.
    """
    try:
        with open(yaml_file, 'r') as fp:
            return yaml.safe_load(fp)
    except yaml.YAMLError as err:
        logger.error(f'{err}')
        raise err
111
+
112
+
113
def dict_to_yaml(d: dict, yaml_file: str):
    """
    Serialize a dict to a yaml file in block style, then log the target path.
    """
    with open(yaml_file, 'w') as fp:
        yaml.dump(d, stream=fp, default_flow_style=False)
    logger.info(f'Dump data to {yaml_file} successfully.')
120
+
121
+
122
def json_to_dict(json_file) -> dict:
    """
    Parse a json file and return its content.

    JSON decode errors are logged and re-raised.
    """
    try:
        with open(json_file, 'r') as fp:
            return json.load(fp)
    except json.JSONDecodeError as err:
        logger.error(f'{err}')
        raise err
134
+
135
+
136
+ def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
137
+ module_name, spliter, cls_name = eval_class_ref.partition(':')
138
+
139
+ try:
140
+ obj_cls = importlib.import_module(module_name)
141
+ except ImportError as e:
142
+ logger.error(f'{e}')
143
+ raise e
144
+
145
+ if spliter:
146
+ for attr in cls_name.split('.'):
147
+ obj_cls = getattr(obj_cls, attr)
148
+
149
+ return functools.partial(obj_cls, *args, **kwargs)
150
+
151
+
152
def markdown_table(header_l, data_l):
    """
    Render a markdown table.

    Args:
        header_l: column headers. Non-string headers are converted with str().
        data_l: rows. Each row is either a single string (treated as a
            one-cell row) or a sequence of cells; non-string cells are
            converted with str(). Rows shorter than the header are padded
            with empty cells.

    Returns:
        str: the markdown table (header line, separator line, one line per row).

    Raises:
        AssertionError: if a row has more cells than there are headers.
    """
    headers = [str(h) for h in header_l]
    n_cols = len(headers)

    lines = [
        f'| {" | ".join(headers)} |',
        f'| {" | ".join(["---"] * n_cols)} |',
    ]
    for data in data_l:
        # A bare string is one cell, not a sequence of characters.
        cells = [data] if isinstance(data, str) else [str(c) for c in data]
        assert len(cells) <= n_cols, 'row has more cells than headers'
        cells += [''] * (n_cols - len(cells))
        lines.append(f'| {" | ".join(cells)} |')
    return '\n'.join(lines)
162
+
163
+
164
def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
    """Pick one element of `choices` using a fresh RNG seeded with `seed`.

    Extra keyword arguments are forwarded to `random.Random.choices`.
    """
    rng = random.Random(seed)
    picked = rng.choices(choices, k=1, **kwargs)
    return picked[0]
167
+
168
+
169
def gen_hash(name: str):
    """Return the hex MD5 digest of `name` encoded as UTF-8."""
    digest = hashlib.md5(name.encode('UTF-8'))
    return digest.hexdigest()
171
+
172
+
173
def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
    """
    Replace non-string *torch_dtype* values with their short string name, in place.

    The passed dictionary and all nested dicts are visited. A value such as
    `torch.float32` becomes the string *"float32"* (the part after the first
    dot of its str() form), so the dictionary can be stored as json.

    Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.

    Returns:
        dict: the same dictionary object that was passed in.
    """
    dtype = d.get('torch_dtype', None)
    if dtype is not None and not isinstance(dtype, str):
        d['torch_dtype'] = str(dtype).split('.')[1]

    nested_dicts = [child for child in d.values() if isinstance(child, dict)]
    for child in nested_dicts:
        dict_torch_dtype_to_str(child)

    return d
189
+
190
+
191
class ResponseParser:
    """Static helpers that extract an answer option from model output text."""

    @staticmethod
    def parse_first_capital(text: str) -> str:
        """Return the first uppercase character of `text`, or '' if none."""
        for t in text:
            if t.isupper():
                return t
        return ''

    @staticmethod
    def parse_last_capital(text: str) -> str:
        """Return the last uppercase character of `text`, or '' if none."""
        for t in text[::-1]:
            if t.isupper():
                return t
        return ''

    @staticmethod
    def parse_first_option_with_choices(text: str, options: list) -> str:
        """
        Find first valid option for text.

        The patterns are tried in order; for the first pattern whose matched
        span contains one of the options, that option is returned.

        Args:
            text: The text to parse.
            options: The options to find. e.g. ['A', 'B', 'C', 'D']

        Returns:
            The matched option, or '' when nothing matches (or `options` is empty).
        """
        if not options:
            # An empty character class would be an invalid regex.
            return ''

        # The option characters are placed inside a character class below, so
        # they are concatenated directly: joining with '|' would make a
        # literal '|' count as an option.
        options_concat = ''.join(re.escape(str(i)) for i in options)

        patterns = [
            rf'答案是?\s?([{options_concat}])',
            rf'答案是?\s?:([{options_concat}])',
            rf'答案是?\s?:([{options_concat}])',
            rf'答案应该?是\s?([{options_concat}])',
            rf'答案应该?选\s?([{options_concat}])',
            rf'答案为\s?([{options_concat}])',
            rf'答案选\s?([{options_concat}])',
            rf'选择?\s?([{options_concat}])',
            rf'故选?\s?([{options_concat}])',
            rf'只有选?项?\s?([{options_concat}])\s?是?对',
            rf'只有选?项?\s?([{options_concat}])\s?是?错',
            rf'只有选?项?\s?([{options_concat}])\s?不?正确',
            rf'只有选?项?\s?([{options_concat}])\s?错误',
            rf'说法不?对选?项?的?是\s?([{options_concat}])',
            rf'说法不?正确选?项?的?是\s?([{options_concat}])',
            rf'说法错误选?项?的?是\s?([{options_concat}])',
            rf'([{options_concat}])\s?是正确的',
            rf'([{options_concat}])\s?是正确答案',
            rf'选项\s?([{options_concat}])\s?正确',
            rf'所以答\s?([{options_concat}])',
            rf'1.\s?([{options_concat}])[.。$]?$',
            rf'所以\s?([{options_concat}][.。$]?$)',
            rf'所有\s?([{options_concat}][.。$]?$)',
            rf'[\s,::,]([{options_concat}])[。,,\.]?$',
            rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
            rf'[\s,,::]因此([{options_concat}])[。\.]?$',
            rf'[是为。]\s?([{options_concat}])[。\.]?$',
            rf'因此\s?([{options_concat}])[。\.]?$',
            rf'显然\s?([{options_concat}])[。\.]?$',
            rf'答案是\s?(\S+)(?:。|$)',
            rf'答案应该是\s?(\S+)(?:。|$)',
            rf'答案为\s?(\S+)(?:。|$)',
            rf'答案是(.*?)[{options_concat}]',
            rf'答案为(.*?)[{options_concat}]',
            rf'固选(.*?)[{options_concat}]',
            rf'答案应该是(.*?)[{options_concat}]',
            rf'[Tt]he answer is [{options_concat}]',
            rf'[Tt]he correct answer is [{options_concat}]',
            rf'[Tt]he correct answer is:\n[{options_concat}]',
            rf'(\s|^)[{options_concat}][\s。,,\.$]',
            rf'[{options_concat}]',
            rf'^选项\s?([{options_concat}])',
            rf'^([{options_concat}])\s?选?项',
            rf'(\s|^)[{options_concat}][\s。,,::\.$]',
            rf'(\s|^)[{options_concat}](\s|$)',
            rf'1.\s?(.*?)$',
        ]

        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                outputs = match.group(0)
                # Return the first option (in `options` order) present in the
                # matched span; if none is present, try the next pattern.
                for i in options:
                    if i in outputs:
                        return i
        return ''

    @staticmethod
    def parse_first_option(text: str) -> str:
        """
        Find first valid option for text.

        Looks for English answer phrases ("Answer: X", "The correct answer
        is: X", ...) and returns the word that follows the first phrase found.

        Args:
            text: The text to parse.

        Returns:
            The captured word, or '' when no phrase matches.
        """
        patterns = [
            r"[Aa]nswer:\s*(\w+)",
            r"[Tt]he correct answer is:\s*(\w+)",
            r"[Tt]he correct answer is:\n\s*(\w+)",
            r"[Tt]he correct answer is:\n\n-\s*(\w+)",
            r"[Tt]he answer might be:\n\n-\s*(\w+)",
            r"[Tt]he answer is \s*(\w+)",
        ]

        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1)
        return ''

    @staticmethod
    def parse_first_capital_multi(text: str) -> str:
        """Return the first run of consecutive letters A-D in `text`, or ''."""
        match = re.search(r'([A-D]+)', text)
        if match:
            return match.group(1)
        return ''

    @staticmethod
    def parse_last_option(text: str, options: str) -> str:
        """Return the last character of `text` that belongs to `options`, or ''.

        `options` is inserted as-is into a regex character class, e.g. 'ABCD'.
        """
        match = re.findall(rf'([{options}])', text)
        if match:
            return match[-1]
        return ''
314
+
315
+
316
def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
    """
    Build the outputs directory path for an evaluation run (no directory is created).

    Args:
        root_dir: parent directory of the outputs directory.
        datasets: dataset names; joined with '-' in the directory name.
        model_id: model id; '/' is replaced by '_'. Falls back to 'default' when empty.
        model_revision: model revision. Falls back to 'default' when empty.

    Returns:
        str: `<root_dir>/eval_<datasets>_<model_id>_<model_revision>`.
    """
    safe_model_id = (model_id or 'default').replace('/', '_')
    revision = model_revision or 'default'
    dataset_tag = '-'.join(datasets)

    return os.path.join(root_dir, f'eval_{dataset_tag}_{safe_model_id}_{revision}')
336
+
337
+
338
def process_outputs_structure(outputs_dir: str, is_make: bool = True) -> dict:
    """
    Compute (and optionally create) the standard sub-directories of an outputs directory.

    Args:
        outputs_dir: root outputs directory.
        is_make: when True, create `outputs_dir` and every sub-directory
            (existing directories are left alone).

    Returns:
        dict: maps each OutputsStructure key (logs, predictions, reviews,
            reports, configs) to its directory path.
    """
    layout = (
        (OutputsStructure.LOGS_DIR, 'logs'),
        (OutputsStructure.PREDICTIONS_DIR, 'predictions'),
        (OutputsStructure.REVIEWS_DIR, 'reviews'),
        (OutputsStructure.REPORTS_DIR, 'reports'),
        (OutputsStructure.CONFIGS_DIR, 'configs'),
    )
    outputs_structure = {
        key: os.path.join(outputs_dir, dir_name) for key, dir_name in layout
    }

    if is_make:
        os.makedirs(outputs_dir, exist_ok=True)
        for sub_dir in outputs_structure.values():
            os.makedirs(sub_dir, exist_ok=True)

    return outputs_structure
362
+
363
+
364
def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
    """
    Import a module and collect selected members from it.

    Args:
        import_path_prefix: e.g. 'evalscope.benchmarks.'
        module_name: The module name to import. e.g. 'mmlu'
        members_to_import: The member names to fetch from the module.
            e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']

    Returns:
        dict: member name -> member object.
            e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
    """
    full_name = import_path_prefix + module_name
    module = importlib.import_module(full_name)
    return {name: getattr(module, name) for name in members_to_import}
383
+
384
+
385
def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
    """
    Round a score, or every value of a score dict, to a fixed number of digits.

    Args:
        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
        keep_num: number of digits to keep.

    Returns:
        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}.
            Any other input type is returned unchanged after a warning is logged.
    """
    if isinstance(score, float):
        return round(score, keep_num)

    if isinstance(score, dict):
        return {name: round(value, keep_num) for name, value in score.items()}

    logger.warning(f'Unknown score type: {type(score)}')
    return score
404
+
405
+
406
def split_str_parts_by(text: str, delimiters: List[str]):
    """Split the text field into parts.

    The text is scanned left to right. Every occurrence of a delimiter starts
    a new part whose 'key' is that delimiter and whose 'content' is the text
    up to the next delimiter. Text before the first delimiter (or the whole
    text when no delimiter occurs, including empty text) becomes a part with
    an empty key.

    Args:
        text: A text to be split.
        delimiters: The delimiters (non-empty strings). When several
            delimiters match at the same position, the first one in this
            list wins.
    Returns:
        The split text in list of dicts, e.g.
        [{'key': '', 'content': 'pre '}, {'key': 'Action:', 'content': ' x'}].
    """
    all_start_chars = [d[0] for d in delimiters]
    all_length = [len(d) for d in delimiters]

    text_list = []
    last_words = ''

    while len(text) > 0:
        for char_idx, char in enumerate(text):
            # Only delimiters starting with the current char can match here.
            match_index = [
                idx for idx, start_char in enumerate(all_start_chars)
                if start_char == char
            ]
            is_delimiter = False
            for index in match_index:
                if text[char_idx:char_idx
                        + all_length[index]] == delimiters[index]:
                    if last_words:
                        # Flush the text collected since the previous delimiter.
                        if text_list:
                            text_list[-1]['content'] = last_words
                        else:
                            text_list.append({
                                'key': '',
                                'content': last_words
                            })
                    last_words = ''
                    text_list.append({'key': delimiters[index]})
                    # Restart the scan right after the delimiter.
                    text = text[char_idx + all_length[index]:]
                    is_delimiter = True
                    break
            if not is_delimiter:
                last_words += char
            else:
                break
        if last_words == text:
            # The scan reached the end without finding another delimiter.
            text = ''

    if text_list:
        text_list[-1]['content'] = last_words
    else:
        # No delimiter was found at all (or the text was empty): return the
        # whole text as a single key-less part instead of failing on [-1].
        text_list.append({'key': '', 'content': last_words})
    return text_list
452
+
453
+
454
def calculate_loss_scale(response: str,
                         use_loss_scale=False
                         ) -> Tuple[List[str], List[float]]:
    """Calculate the loss scale by splitting the agent response.
    This algorithm comes from paper: https://arxiv.org/pdf/2309.00986.pdf
    Agent response format:
    ```text
        Thought: you should always think about what to do
        Action: the action to take, should be one of the above tools[fire_recognition,
            fire_alert, call_police, call_fireman]
        Action Input: the input to the action
        Observation: the result of the action
        ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
        Thought: I now know the final answer
        Final Answer: the final answer to the original input question
    ```
    Each part contributes two entries: its keyword and its content, with one
    weight each. When the response does not contain both 'Action:' and
    'Observation:', or weighting is disabled, the whole response is returned
    as a single part with weight 1.0.

    Args:
        response: The response text
        use_loss_scale: Use weighted loss. With this, some part of the loss will be enhanced to improve performance.
    Returns:
        A tuple of agent response parts and their weights.
    """
    if not ('Action:' in response and 'Observation:' in response
            and use_loss_scale):
        return [response], [1.0]

    agent_keyword = [
        'Action:', 'Action Input:', 'Thought:', 'Final Answer:',
        'Observation:'
    ]
    # (weight of the keyword, weight of the content) for every part key.
    pair_weights = {
        'Action:': (2.0, 2.0),
        'Action Input:': (2.0, 2.0),
        'Thought:': (1.0, 1.0),
        'Final Answer:': (1.0, 1.0),
        '': (1.0, 1.0),
        'Observation:': (2.0, 0.0),
    }

    weights = []
    agent_content = []
    for part in split_str_parts_by(response, agent_keyword):
        weights.extend(pair_weights.get(part['key'], ()))
        agent_content.append(part['key'])
        agent_content.append(part['content'])
    return agent_content, weights
499
+
500
+
501
def get_bucket_sizes(max_length: int) -> List[int]:
    """Return four bucket sizes: 1x, 2x, 3x and 4x of `max_length // 4`."""
    quarter = max_length // 4
    return [quarter * multiple for multiple in range(1, 5)]
503
+
504
+
505
def _get_closet_bucket(bucket_sizes, data_length):
    """Select the one from bucket_sizes that is closest in distance to
    data_length. This is required for TorchAcc.

    The smallest bucket that is not shorter than data_length is returned.
    When no bucket is large enough, data_length is appended to bucket_sizes
    (the list is modified in place) and returned.
    """
    best_fit = sys.maxsize
    for size in bucket_sizes:
        is_exact = size == data_length
        is_tighter_upper_bound = data_length < size < best_fit
        if is_exact or is_tighter_upper_bound:
            best_fit = size

    if best_fit != sys.maxsize:
        return best_fit

    # Nothing fits: register data_length itself as a new bucket.
    bucket_sizes.append(data_length)
    return data_length
519
+
520
+
521
def pad_and_split_batch(padding_to, input_ids, attention_mask, labels,
                        loss_scale, max_length, tokenizer, rank, world_size):
    """Pad a batch to a bucket length and keep only this rank's slice of it.

    Args:
        padding_to: when None, the batch is right-padded to the bucket length
            chosen from `get_bucket_sizes(max_length)`; otherwise no padding
            is applied here.
        input_ids: batch tensor; padded with `tokenizer.pad_token_id`.
            # assumes shape (batch, seq_len) - the code indexes shape[0]/shape[1]
        attention_mask: batch tensor; padded with 0.
        labels: batch tensor; padded with -100.
        loss_scale: optional batch tensor (or None); padded with 0.
        max_length: maximum length used to derive the bucket sizes.
        tokenizer: object providing `pad_token_id` (only read when padding).
        rank: index of this data-parallel rank.
        world_size: number of data-parallel ranks.

    Returns:
        Tuple of (input_ids, attention_mask, labels, loss_scale). When the
        batch has at least `world_size` rows, each tensor is reduced to rows
        [rank * n, (rank + 1) * n) with n = batch_rows // world_size;
        otherwise the tensors are returned unsplit.
    """
    # `loss_scale` may be a tensor: its truth value is ambiguous (raises) for
    # more than one element, so presence is tested with `is not None`.
    has_loss_scale = loss_scale is not None

    if padding_to is None:
        longest_len = input_ids.shape[-1]
        bucket_sizes = get_bucket_sizes(max_length)
        bucket_data_length = _get_closet_bucket(bucket_sizes, longest_len)
        padding_length = bucket_data_length - input_ids.shape[1]
        input_ids = F.pad(input_ids, (0, padding_length), 'constant',
                          tokenizer.pad_token_id)
        attention_mask = F.pad(attention_mask, (0, padding_length), 'constant',
                               0)
        if has_loss_scale:
            loss_scale = F.pad(loss_scale, (0, padding_length), 'constant', 0.)
        labels = F.pad(labels, (0, padding_length), 'constant', -100)

    # manually split the batch to different DP rank.
    batch_size = input_ids.shape[0] // world_size
    if batch_size > 0:
        start = rank * batch_size
        end = (rank + 1) * batch_size
        input_ids = input_ids[start:end, :]
        attention_mask = attention_mask[start:end, :]
        labels = labels[start:end, :]
        if has_loss_scale:
            loss_scale = loss_scale[start:end, :]
    return input_ids, attention_mask, labels, loss_scale
547
+
548
+
549
def get_dist_setting() -> Tuple[int, int, int, int]:
    """return rank, local_rank, world_size, local_world_size

    Values are read from the RANK, LOCAL_RANK, WORLD_SIZE and
    LOCAL_WORLD_SIZE environment variables; missing variables default to
    -1, -1, 1 and 1 respectively.
    """
    env = os.environ
    rank = int(env.get('RANK', -1))
    local_rank = int(env.get('LOCAL_RANK', -1))
    world_size = int(env.get('WORLD_SIZE', 1))
    local_world_size = int(env.get('LOCAL_WORLD_SIZE', 1))
    return rank, local_rank, world_size, local_world_size
556
+
557
+
558
def use_torchacc() -> bool:
    """Return True only when the USE_TORCHACC environment variable equals '1'."""
    flag = os.environ.get('USE_TORCHACC', '0')
    return flag == '1'
560
+
561
+
562
def is_module_installed(module_name):
    """Return True if `module_name` can be imported, False on ImportError.

    The module is actually imported as part of the check.
    """
    try:
        importlib.import_module(module_name)
    except ImportError:
        return False
    else:
        return True
568
+
569
+
570
def get_module_path(module_name):
    """Return the absolute path of the origin recorded in a module's import spec.

    Raises:
        ValueError: if no spec (or no origin) is found for `module_name`.
    """
    spec = importlib.util.find_spec(module_name)
    if not (spec and spec.origin):
        raise ValueError(f'Cannot find module: {module_name}')
    return os.path.abspath(spec.origin)
576
+
577
+
578
def get_valid_list(input_list, candidate_list):
    """
    Partition input_list by membership in candidate_list.

    Args:
        input_list: The input list.
        candidate_list: The candidate list.

    Returns:
        valid_list: The items of input_list found in candidate_list, in input order.
        invalid_list: The items of input_list not found in candidate_list, in input order.
    """

    def is_candidate(item):
        return item in candidate_list

    valid_list = list(filter(is_candidate, input_list))
    invalid_list = [item for item in input_list if not is_candidate(item)]
    return valid_list, invalid_list
591
+
592
+
593
def get_latest_folder_path(work_dir):
    """Return the path of the most recent timestamp-named sub-directory of work_dir.

    Only direct sub-directories named like YYYYMMDD_HHMMSS are considered.
    When there is none, a message is printed and None is returned.
    """
    from datetime import datetime

    # Names must look like a timestamp (YYYYMMDD_HHMMSS)
    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')

    timestamped_folders = []
    for entry in os.listdir(work_dir):
        if not os.path.isdir(os.path.join(work_dir, entry)):
            continue
        if timestamp_pattern.match(entry):
            timestamped_folders.append(entry)

    if not timestamped_folders:
        print(f'>> No timestamped folders found in {work_dir}!')
        return None

    # Compare folders by their parsed timestamp and keep the newest one
    latest_folder = max(
        timestamped_folders,
        key=lambda name: datetime.strptime(name, "%Y%m%d_%H%M%S"))

    return os.path.join(work_dir, latest_folder)
616
+
617
+
618
def csv_to_list(file_path: str) -> List[dict]:
    """Read a UTF-8 csv file with a header row into a list of row dicts."""
    import csv

    with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
        rows = list(csv.DictReader(csv_file))

    return rows
evalscope/version.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ __version__ = '0.5.0'  # package version string
4
+ __release_datetime__ = '2024-08-01 08:00:00'  # release timestamp recorded for this version