evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/registry/tasks/general_qa.yaml ADDED
@@ -0,0 +1,27 @@
+ model_args: # model args should be followed by benchmark requirements
+   revision: default
+   precision: torch.float16
+   device_map: auto
+   # model_name_or_path: qwen/qwen-7b-chat
+ generation_config:
+   temperature: 0.3
+   max_length: 2048
+   max_new_tokens: 512
+   top_k: 50
+   top_p: 0.85
+   do_sample: true
+   num_beams: 1
+   repetition_penalty: 1.0
+   # eos_token_id: null
+   # pad_token_id: null
+ dataset_args: {}
+ dry_run: false
+ model: null # Note: to be implemented as CustomModel
+ eval_type: custom
+ datasets:
+   - general_qa
+ outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
+ use_cache: false
+ stage: all
+ dataset_hub: Local # `Local` or `ModelScope`
+ limit: null
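The registry YAML above is a complete task config that `run_task()` in evalscope/run.py (shown later in this diff) can consume directly: it accepts a `.yaml` or `.json` path, a dict, a `TaskConfig`, or a list of them. A minimal sketch of that entry point in Python; the file path is an assumption, and because this config uses `eval_type: custom` with `model: null`, a `CustomModel` instance (whose interface is not shown in this diff) must be supplied before running:

    from evalscope.run import run_task
    from evalscope.utils import yaml_to_dict

    # Assumed path; point this at wherever the registry YAML lives on disk.
    cfg = yaml_to_dict('evalscope/registry/tasks/general_qa.yaml')

    # The file ships with `model: null`; for eval_type `custom`, run_task()
    # requires an evalscope.models.custom.CustomModel instance here.
    cfg['model'] = ...  # placeholder: replace with a constructed CustomModel

    results = run_task(cfg)  # returns a dict keyed by dataset name, e.g. {'general_qa': {...}}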
evalscope/registry/tasks/gsm8k.yaml ADDED
@@ -0,0 +1,29 @@
+ model_args: # model args should be followed by benchmark requirements
+   revision: default
+   precision: torch.float16
+   device_map: auto
+   # model_name_or_path: qwen/qwen-7b-chat
+ generation_config:
+   temperature: 0.3
+   max_length: 2048
+   max_new_tokens: 512
+   top_k: 50
+   top_p: 0.85
+   do_sample: false
+   num_beams: 1
+   repetition_penalty: 1.0
+   # eos_token_id: null
+   # pad_token_id: null
+ dataset_args:
+   gsm8k:
+     few_shot_num: 0
+ dry_run: false
+ model: null # Note: to be implemented as CustomModel
+ eval_type: custom
+ datasets:
+   - gsm8k
+ outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
+ use_cache: false
+ stage: all
+ dataset_hub: ModelScope # `Local` or `ModelScope`
+ limit: null
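On the command line, run.py builds the same nested structure from flat flags: `--dataset-args` is parsed with `json.loads`, while `--model-args` and `--generation-config` go through `parse_str_args()` (both are defined in evalscope/run.py, included later in this diff). A small sketch of that mapping, using values that mirror the zero-shot gsm8k config above:

    import json
    from evalscope.run import parse_str_args

    # What the CLI flags reduce to before main() hands them to run_task():
    dataset_args = json.loads('{"gsm8k": {"few_shot_num": 0}}')
    # -> {'gsm8k': {'few_shot_num': 0}}

    generation_config = parse_str_args('do_sample=False,repetition_penalty=1.0,max_new_tokens=512')
    # -> {'do_sample': False, 'repetition_penalty': 1.0, 'max_new_tokens': 512}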
evalscope/registry/tasks/mmlu.yaml ADDED
@@ -0,0 +1,29 @@
+ model_args: # model args should be followed by benchmark requirements
+   revision: default
+   precision: torch.float16
+   device_map: auto
+   # model_name_or_path: qwen/qwen-7b-chat
+ generation_config:
+   temperature: 0.3
+   max_length: 2048
+   max_new_tokens: 512
+   top_k: 50
+   top_p: 0.85
+   do_sample: false
+   num_beams: 1
+   repetition_penalty: 1.0
+   # eos_token_id: null
+   # pad_token_id: null
+ dataset_args:
+   mmlu:
+     few_shot_num: 0
+ dry_run: false
+ model: null # Note: to be implemented as CustomModel
+ eval_type: custom
+ datasets:
+   - mmlu
+ outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
+ use_cache: true
+ stage: all
+ dataset_hub: ModelScope # `Local` or `ModelScope`
+ limit: null
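This is the only registry config here with `use_cache: true`. When run.py is driven from the CLI instead, `--use-cache` arrives as the string 'true'/'false' and `main()` normalizes it to a bool before building the task config; a one-line sketch of that conversion (the flag value is illustrative):

    # Mirrors the normalization in main(): `--use-cache true` -> True, 'false' -> False.
    use_cache_flag = 'true'
    use_cache = False if use_cache_flag.lower() == 'false' else True
    assert use_cache is True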
evalscope/registry/tasks/mmlu_mini.yaml ADDED
@@ -0,0 +1,27 @@
+ model_args: # model args should be followed by benchmark requirements
+   revision: default
+   precision: torch.float16
+   device_map: auto
+   # model_name_or_path: qwen/qwen-7b-chat
+ generation_config:
+   temperature: 0.3
+   max_length: 2048
+   max_new_tokens: 512
+   top_k: 50
+   top_p: 0.85
+   do_sample: false
+   num_beams: 1
+   repetition_penalty: 1.0
+   # eos_token_id: null
+   # pad_token_id: null
+ dataset_args: {'mmlu': {'subset_list': ['high_school_european_history', 'business_ethics']}}
+ dry_run: false
+ model: null # Note: to be implemented as CustomModel
+ eval_type: custom
+ datasets:
+   - mmlu
+ outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
+ use_cache: false
+ stage: all
+ dataset_hub: ModelScope # `Local` or `ModelScope`
+ limit: null
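Unlike gsm8k.yaml and mmlu.yaml, this file writes `dataset_args` in YAML flow style; both spellings load to the same nested dict, which is where `run_task()` picks up `subset_list`. A quick equivalence check (assumes PyYAML is installed):

    import yaml

    flow = "dataset_args: {'mmlu': {'subset_list': ['high_school_european_history', 'business_ethics']}}"
    block = """
    dataset_args:
      mmlu:
        subset_list:
          - high_school_european_history
          - business_ethics
    """
    assert yaml.safe_load(flow) == yaml.safe_load(block)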
evalscope/run.py ADDED
@@ -0,0 +1,404 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa
+ import copy
+ import json
+ import argparse
+ import os.path
+ from typing import Union, List
+ import torch # noqa
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ from evalscope.evaluator import Evaluator
+ from evalscope.evaluator.evaluator import HumanevalEvaluator
+ from evalscope.models.custom import CustomModel
+ from evalscope.utils import import_module_util, yaml_to_dict, make_outputs_dir, gen_hash, json_to_dict, EvalBackend
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ """
+ Run evaluation for LLMs.
+ """
+
+ BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
+ MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
+
+     parser.add_argument('--model',
+                         help='The model id on modelscope, or local model dir.',
+                         type=str,
+                         # required=True,
+                         required=False,
+                         )
+     parser.add_argument('--model-type',
+                         help='Deprecated. See `--template-type`',
+                         type=str,
+                         required=False,
+                         default=None)
+     parser.add_argument('--template-type',
+                         type=str,
+                         help='The template type for generation, should be a string.'
+                              'Refer to `https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md` for more details.',
+                         required=False,
+                         )
+     parser.add_argument('--eval-type',
+                         type=str,
+                         help='The type for evaluating. '
+                              'service - for APIs, TO-DO'
+                              'checkpoint - for models on ModelScope or local model dir, '
+                              'custom - for custom models.'
+                              ' Need to set `--model` to evalscope.models.custom.CustomModel format.'
+                              'default to `checkpoint`.',
+                         required=False,
+                         default='checkpoint',
+                         )
+     parser.add_argument('--model-args',
+                         type=str,
+                         help='The model args, should be a string.',
+                         required=False,
+                         default='revision=None,precision=torch.float16,device_map=auto'
+                         )
+     parser.add_argument('--generation-config',
+                         type=str,
+                         help='The generation config, should be a string.',
+                         required=False,
+                         default='do_sample=False,repetition_penalty=1.0,max_new_tokens=512',
+                         )
+     parser.add_argument('--datasets',
+                         help='Dataset id list, align to the module name in evalscope.benchmarks',
+                         type=str,
+                         nargs='+',
+                         required=False,
+                         )
+     parser.add_argument('--dataset-args',
+                         type=json.loads,
+                         help='The dataset args, should be a json string. The key of dict should be aligned to datasets,'
+                              'e.g. {"humaneval": {"local_path": "/to/your/path"}}',
+                         required=False,
+                         default='{}')
+     parser.add_argument('--dataset-dir',
+                         help='The datasets dir. Use to specify the local datasets or datasets cache dir.'
+                              'See --dataset-hub for more details.',
+                         required=False,
+                         default=DEFAULT_ROOT_CACHE_DIR)
+     parser.add_argument('--dataset-hub',
+                         help='The datasets hub, can be `ModelScope` or `HuggingFace` or `Local`. '
+                              'Default to `ModelScope`.'
+                              'If `Local`, the --dataset-dir should be local input data dir.'
+                              'Otherwise, the --dataset-dir should be the cache dir for datasets.',
+                         required=False,
+                         default='ModelScope')
+     parser.add_argument('--outputs',
+                         help='Outputs dir. Default to `outputs`, which means dump to current path: ./outputs',
+                         required=False,
+                         default='outputs')
+     parser.add_argument('--work-dir',
+                         help='The root cache dir.',
+                         required=False,
+                         default=DEFAULT_ROOT_CACHE_DIR)
+     parser.add_argument('--limit',
+                         type=int,
+                         help='Max evaluation samples num for each subset. Default to None, which means no limit.',
+                         default=None)
+     parser.add_argument('--debug',
+                         help='Debug mode, will print information for debugging.',
+                         action='store_true',
+                         default=False)
+     parser.add_argument('--dry-run',
+                         help='Dry run in single processing mode.',
+                         action='store_true',
+                         default=False)
+     parser.add_argument('--mem-cache',
+                         help='To use memory cache or not.',
+                         action='store_true',
+                         default=False)
+     parser.add_argument('--use-cache',
+                         help='To reuse the cache or not. Default to `true`.',
+                         type=str,
+                         default='false')
+     parser.add_argument('--stage',
+                         help='The stage of evaluation pipeline, '
+                              'can be `all`, `infer`, `review`. Default to `all`.',
+                         type=str,
+                         default='all')
+
+     parser.add_argument('--eval-backend',
+                         help='The evaluation backend to use. Default to None.'
+                              'can be `Native`, `OpenCompass` and `ThirdParty`. '
+                              'Default to `Native`.',
+                         type=str,
+                         default=EvalBackend.NATIVE.value,
+                         required=False)
+
+     parser.add_argument('--eval-config',
+                         help='The eval task config file path for evaluation backend, should be a yaml or json file.',
+                         type=str,
+                         default=None,
+                         required=False)
+
+     args = parser.parse_args()
+
+     return args
+
+
+ def parse_str_args(str_args: str) -> dict:
+     assert isinstance(str_args, str), 'args should be a string.'
+     arg_list: list = str_args.strip().split(',')
+     arg_list = [arg.strip() for arg in arg_list]
+     arg_dict: dict = dict([arg.split('=') for arg in arg_list])
+
+     final_args = dict()
+     for k, v in arg_dict.items():
+         try:
+             final_args[k] = eval(v)
+         except:
+             if v.lower() == 'true':
+                 v = True
+             if v.lower() == 'false':
+                 v = False
+             final_args[k] = v
+
+     return final_args
+
+
+ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[dict, List[dict]]:
+
+     if isinstance(task_cfg, list):
+         eval_results = []
+         for one_task_cfg in task_cfg:
+             eval_results.append(run_task(one_task_cfg))
+         return eval_results
+
+     if isinstance(task_cfg, TaskConfig):
+         task_cfg = task_cfg.to_dict()
+     elif isinstance(task_cfg, str):
+         if task_cfg.endswith('.yaml'):
+             task_cfg = yaml_to_dict(task_cfg)
+         elif task_cfg.endswith('.json'):
+             task_cfg = json_to_dict(task_cfg)
+         else:
+             raise ValueError(f'Unsupported file format: {task_cfg}, should be a yaml or json file.')
+     elif isinstance(task_cfg, dict):
+         logger.info('** Args: Task config is provided with dictionary type. **')
+     else:
+         raise ValueError('** Args: Please provide a valid task config. **')
+
+     # Check and run evaluation backend
+     if task_cfg.get('eval_backend') is None:
+         task_cfg['eval_backend'] = EvalBackend.NATIVE.value
+
+     eval_backend = task_cfg.get('eval_backend')
+     eval_config: Union[str, dict] = task_cfg.get('eval_config')
+
+     if eval_backend != EvalBackend.NATIVE.value:
+
+         if eval_config is None:
+             logger.warning(f'Got eval_backend {eval_backend}, but eval_config is not provided.')
+
+         if eval_backend == EvalBackend.OPEN_COMPASS.value:
+             from evalscope.backend.opencompass import OpenCompassBackendManager
+             oc_backend_manager = OpenCompassBackendManager(config=eval_config)
+             oc_backend_manager.run()
+         elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
+             from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
+             vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
+             vlm_eval_kit_backend_manager.run()
+         # TODO: Add other evaluation backends
+         elif eval_backend == EvalBackend.THIRD_PARTY.value:
+             raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
+
+         return dict()
+
+     # Get the output task config
+     output_task_cfg = copy.copy(task_cfg)
+     logger.info(output_task_cfg)
+
+     model_args: dict = task_cfg.get('model_args',
+                                     {'revision': 'default', 'precision': torch.float16, 'device_map': 'auto'})
+     # Get the GLOBAL default config (infer_cfg) for prediction
+     generation_config: dict = task_cfg.get('generation_config',
+                                            {'do_sample': False,
+                                             'repetition_penalty': 1.0,
+                                             'max_length': 2048,
+                                             'max_new_tokens': 512,
+                                             'temperature': 0.3,
+                                             'top_k': 50,
+                                             'top_p': 0.8, }
+                                            )
+     dataset_args: dict = task_cfg.get('dataset_args', {})
+     dry_run: bool = task_cfg.get('dry_run', False)
+     model: Union[str, CustomModel] = task_cfg.get('model', None)
+     model_type: str = task_cfg.get('model_type', None)
+     template_type: str = task_cfg.get('template_type', None)
+     eval_type: str = task_cfg.get('eval_type', 'checkpoint')
+     datasets: list = task_cfg.get('datasets', None)
+     work_dir: str = task_cfg.get('work_dir', DEFAULT_ROOT_CACHE_DIR)
+     outputs: str = task_cfg.get('outputs', 'outputs')
+     mem_cache: bool = task_cfg.get('mem_cache', False)
+     use_cache: bool = task_cfg.get('use_cache', True)
+     dataset_hub: str = task_cfg.get('dataset_hub', 'ModelScope')
+     dataset_dir: str = task_cfg.get('dataset_dir', DEFAULT_ROOT_CACHE_DIR)
+     stage: str = task_cfg.get('stage', 'all')
+     limit: int = task_cfg.get('limit', None)
+     debug: str = task_cfg.get('debug', False)
+
+     if model is None or datasets is None:
+         if not task_cfg.get('eval_backend'):
+             raise ValueError('** Args: Please provide model and datasets. **')
+
+     if model_type:
+         logger.warning('** DeprecatedWarning: `--model-type` is deprecated, please use `--template-type` instead.')
+
+     model_precision = model_args.get('precision', torch.float16)
+     if isinstance(model_precision, str):
+         model_precision = eval(model_precision)
+
+     if mem_cache:
+         logger.warning('** DeprecatedWarning: `--mem-cache` is deprecated, please use `--use-cache` instead.')
+
+     logger.info(f'** Set use_cache to {use_cache}.')
+
+     # Get model args
+     if dry_run:
+         from evalscope.models.dummy_chat_model import DummyChatModel
+         model_id: str = 'dummy'
+         model_revision: str = 'v1.0.0'
+     elif eval_type == 'custom':
+         model_id: str = None
+         model_revision: str = None
+     else:
+         model_id: str = model
+         model_revision: str = model_args.get('revision', 'default')
+
+     # Get outputs directory
+     if isinstance(model_id, str) and os.path.isdir(os.path.expanduser(model_id)):
+         # get the output_model_id when model_id is a local model dir
+         output_model_id: str = gen_hash(model_id)
+     else:
+         output_model_id: str = model_id
+     if outputs == 'outputs':
+         outputs = make_outputs_dir(root_dir=os.path.join(work_dir, 'outputs'),
+                                    datasets=datasets,
+                                    model_id=output_model_id,
+                                    model_revision=model_revision,)
+
+     eval_results = dict()
+     for dataset_name in datasets:
+         # Get imported_modules
+         imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
+
+         if dataset_name == 'humaneval' and dataset_args.get('humaneval', {}).get('local_path') is None:
+             raise ValueError('Please specify the local problem path of humaneval dataset in --dataset-args,'
+                              'e.g. {"humaneval": {"local_path": "/to/your/path"}}, '
+                              'And refer to https://github.com/openai/human-eval/tree/master#installation to install it,'
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+         if dry_run:
+             from evalscope.models.dummy_chat_model import DummyChatModel
+             model_adapter = DummyChatModel(model_cfg=dict())
+         elif eval_type == 'custom':
+             if not isinstance(model, CustomModel):
+                 raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(model)}.')
+             from evalscope.models.model_adapter import CustomModelAdapter
+             model_adapter = CustomModelAdapter(custom_model=model)
+         else:
+             # Init model adapter
+             device_map = model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
+             model_adapter = imported_modules['ModelAdapterClass'](model_id=model_id,
+                                                                   model_revision=model_revision,
+                                                                   device_map=device_map,
+                                                                   torch_dtype=model_precision,
+                                                                   cache_dir=work_dir,
+                                                                   template_type=template_type)
+
+         if dataset_name == 'humaneval':
+             problem_file: str = dataset_args.get('humaneval', {}).get('local_path')
+
+             evaluator = HumanevalEvaluator(problem_file=problem_file,
+                                            model_id=model_id,
+                                            model_revision=model_revision,
+                                            model_adapter=model_adapter,
+                                            outputs_dir=outputs,
+                                            is_custom_outputs_dir=False, )
+         else:
+             # TODO: CHECK dataset_args
+             dataset_name_or_path: str = dataset_args.get(dataset_name, {}).get('local_path') or imported_modules[
+                 'DATASET_ID']
+
+             in_prompt_template: str = dataset_args.get(dataset_name, {}).get('prompt_template', '')
+
+             # Init data adapter
+             few_shot_num: int = dataset_args.get(dataset_name, {}).get('few_shot_num', None)
+             few_shot_random: bool = dataset_args.get(dataset_name, {}).get('few_shot_random', True)
+             data_adapter = imported_modules['DataAdapterClass'](few_shot_num=few_shot_num,
+                                                                 few_shot_random=few_shot_random,
+                                                                 prompt_template=in_prompt_template,)
+
+             in_subset_list: list = dataset_args.get(dataset_name, {})\
+                 .get('subset_list', imported_modules['SUBSET_LIST'])
+             logger.info(f'\n** Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
+
+             evaluator = Evaluator(
+                 dataset_name_or_path=dataset_name_or_path,
+                 subset_list=in_subset_list,
+                 data_adapter=data_adapter,
+                 model_adapter=model_adapter,
+                 use_cache=use_cache,
+                 root_cache_dir=work_dir,
+                 outputs_dir=outputs,
+                 is_custom_outputs_dir=outputs != 'outputs',
+                 datasets_dir=dataset_dir,
+                 datasets_hub=dataset_hub,
+                 stage=stage,
+                 eval_type=eval_type,
+                 overall_task_cfg=output_task_cfg,
+             )
+
+         infer_cfg = generation_config or {}
+         infer_cfg.update(dict(limit=limit))
+         res_dict: dict = evaluator.eval(infer_cfg=infer_cfg, debug=debug)
+
+         eval_results[dataset_name] = res_dict
+
+     return eval_results
+
+
+ def main():
+     args = parse_args()
+
+     # Get task_cfg
+     use_cache: bool = False if args.use_cache.lower() == 'false' else True
+     task_cfg = {
+         'model_args': parse_str_args(args.model_args),
+         'generation_config': parse_str_args(args.generation_config),
+         'dataset_args': args.dataset_args,
+         'dry_run': args.dry_run,
+         'model': args.model,
+         'template_type': args.template_type,
+         'eval_type': args.eval_type,
+         'datasets': args.datasets,
+         'work_dir': args.work_dir,
+         'outputs': args.outputs,
+         'mem_cache': args.mem_cache,
+         'use_cache': use_cache,
+         'dataset_hub': args.dataset_hub,
+         'dataset_dir': args.dataset_dir,
+         'stage': args.stage,
+         'limit': args.limit,
+         'debug': args.debug,
+
+         'eval_backend': args.eval_backend,
+         'eval_config': args.eval_config,
+     }
+
+     run_task(task_cfg)
+
+
+ if __name__ == '__main__':
+     # Usage: python3 evalscope/run.py --model ZhipuAI/chatglm2-6b --datasets mmlu hellaswag --limit 10
+     # Usage: python3 evalscope/run.py --model qwen/Qwen-1_8B --generation-config do_sample=false,temperature=0.0 --datasets ceval --dataset-args '{"ceval": {"few_shot_num": 0}}' --limit 10
+     main()
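Besides the CLI shown in the usage comments above, `run_task()` can also be called programmatically with a dict that uses the same keys `main()` assembles. A minimal checkpoint-style sketch; the model id comes from the usage comment above, while the template type, dataset choice, and generation settings are illustrative assumptions:

    from evalscope.run import run_task

    # Keys mirror the task_cfg dict built in main(); anything omitted falls back
    # to the defaults read via task_cfg.get(...) inside run_task().
    task_cfg = {
        'model': 'ZhipuAI/chatglm2-6b',      # ModelScope id or local model dir
        'template_type': 'chatglm2',         # assumption: a swift template name, see --template-type help
        'eval_type': 'checkpoint',
        'datasets': ['hellaswag'],
        'dataset_args': {'hellaswag': {'few_shot_num': 0}},
        'generation_config': {'do_sample': False, 'max_new_tokens': 512},
        'limit': 10,                         # cap samples per subset for a quick smoke test
    }

    results = run_task(task_cfg)             # -> {'hellaswag': {...}}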