evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/models/model.py
@@ -0,0 +1,88 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class BaseModel(ABC):
+
+    def __init__(self, model_cfg: dict, **kwargs):
+        """
+        Base model class.
+
+        Args:
+            model_cfg (dict): The model configuration. Depending on the specific model. Example:
+                {'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0'}
+
+            **kwargs: kwargs
+        """
+        self.model_cfg: dict = model_cfg
+        self.kwargs = kwargs
+
+    @abstractmethod
+    def predict(self, *args, **kwargs) -> Any:
+        """
+        Model prediction func.
+        """
+        raise NotImplementedError
+
+
+class ChatBaseModel(BaseModel):
+
+    def __init__(self, model_cfg: dict, **kwargs):
+        """
+        Chat base model class. Depending on the specific model.
+
+        Args:
+            model_cfg (dict):
+                {'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0', 'device_map': 'auto'}
+
+            **kwargs: kwargs
+        """
+        super(ChatBaseModel, self).__init__(model_cfg=model_cfg, **kwargs)
+
+    @abstractmethod
+    def predict(self, inputs: dict, **kwargs) -> dict:
+        """
+        Model prediction func. The inputs and outputs are compatible with OpenAI Chat Completions APIs.
+        Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api
+
+        # TODO: follow latest OpenAI API
+
+        Args:
+            inputs (dict): The input prompts and history. Input format:
+                {'messages': [
+                    {'role': 'system', 'content': 'You are a helpful assistant.'},
+                    {'role': 'user', 'content': 'Who won the world series in 2020?'},
+                    {'role': 'assistant', 'content': 'The Los Angeles Dodgers won the World Series in 2020.'},
+                    ]
+                 'history': [
+                    {'role': 'system', 'content': 'Hello'},
+                    {'role': 'user', 'content': 'Hi'}]
+                }
+
+            kwargs (dict): Could be inference configuration. Default: None.
+                cfg format: {'max_length': 1024}
+
+        Returns: The result format:
+            {
+              'choices': [
+                {
+                  'index': 0,
+                  'message': {
+                    'content': 'The 2020 World Series was played in Texas at Globe Life Field in Arlington.',
+                    'role': 'assistant'
+                  }
+                }
+              ],
+              'created': 1677664795,
+              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
+              'model': 'gpt-3.5-turbo-0613',
+              'object': 'chat.completion',
+              'usage': {
+                'completion_tokens': 17,
+                'prompt_tokens': 57,
+                'total_tokens': 74
+              }
+            }
+        """
+        raise NotImplementedError
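The ChatBaseModel.predict contract above takes and returns OpenAI-style chat payloads. For orientation only (not part of the published diff), a minimal subclass satisfying that contract could look like the sketch below; it assumes the class is importable from evalscope.models.model as the file list suggests, and the EchoChatModel name and its echo behaviour are made up for the example.

import time

from evalscope.models.model import ChatBaseModel  # assumed import path, per the file list


class EchoChatModel(ChatBaseModel):
    """Toy ChatBaseModel subclass: echoes the last user message back."""

    def predict(self, inputs: dict, **kwargs) -> dict:
        # Pick the most recent user turn from the documented 'messages' format.
        last_user = next(m['content'] for m in reversed(inputs['messages'])
                         if m['role'] == 'user')
        # Return the OpenAI-compatible payload described in the docstring above.
        return {
            'choices': [{'index': 0,
                         'message': {'content': last_user, 'role': 'assistant'}}],
            'created': int(time.time()),
            'model': self.model_cfg.get('model_id', 'echo-model'),
            'object': 'chat.completion',
            'usage': {},
        }


# Example call, mirroring the docstring's input format:
# EchoChatModel(model_cfg={'model_id': 'echo-model'}).predict(
#     {'messages': [{'role': 'user', 'content': 'Hi'}]})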
evalscope/models/model_adapter.py
@@ -0,0 +1,586 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) EleutherAI, Inc. and its affiliates.
+# flake8: noqa
+import os
+import sys
+from typing import List, Any, Union, Dict
+import numpy as np
+import time
+from abc import ABC, abstractmethod
+from copy import deepcopy
+
+import torch
+from torch import dtype
+
+from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+from evalscope.models.custom import CustomModel
+from evalscope.models.template import get_template, StopWordsCriteria
+from evalscope.utils.logger import get_logger
+from transformers import StoppingCriteriaList
+
+logger = get_logger()
+
+# Notes:
+# - modelscope>=1.9.5
+
+
+def get_model_cache_dir(root_cache_dir: str):
+    model_cache_dir = os.path.join(root_cache_dir, 'models')
+    model_cache_dir = os.path.expanduser(model_cache_dir)
+    os.makedirs(model_cache_dir, exist_ok=True)
+    return model_cache_dir
+
+
+class BaseModelAdapter(ABC):
+    """
+    Base class for model adapter.
+    """
+
+    def __init__(self, model, tokenizer, model_cfg: dict):
+        """
+        Args:
+            model: The model instance which is compatible with
+                AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers.
+            tokenizer: The tokenizer instance which is compatible with AutoTokenizer of transformers.
+            model_cfg:
+                Attributes: model_id, model_revision, device_map, torch_dtype
+        """
+        self.model = model
+        self.tokenizer = tokenizer
+        self.model_cfg = model_cfg
+
+    @abstractmethod
+    @torch.no_grad()
+    def predict(self, *args, **kwargs) -> Any:
+        """
+        Model prediction func.
+        """
+        raise NotImplementedError
+
+
+class MultiChoiceModelAdapter(BaseModelAdapter):
+    """ The multi-choice model adapter. """
+
+    _DEFAULT_MAX_LENGTH = 2048
+
+    def __init__(self,
+                 model_id: str,
+                 device_map: str = 'auto',
+                 torch_dtype: dtype = torch.bfloat16,
+                 model_revision: str = None,
+                 max_length: int = None,
+                 cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                 **kwargs):
+        """
+        Args:
+            model_id: The model id on ModelScope, or local model_dir. TODO: torch.nn.module to be supported.
+            device_map: The device map for model inference.
+            torch_dtype: The torch dtype for model inference. Default: torch.bfloat16.
+            model_revision: The model revision on ModelScope. Default: None.
+            max_length: The max length of input sequence. Default: None.
+            **kwargs: Other args.
+        """
+        model_cache_dir = get_model_cache_dir(cache_dir)
+
+        self.model_id: str = model_id
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        logger.warning(f'**Device: {self.device}')
+
+        torch_dtype = torch_dtype if torch_dtype is not None else 'auto'
+
+        model_cfg: dict = dict()
+        model_cfg['model_id'] = model_id
+        model_cfg['device_map'] = device_map
+        model_cfg['torch_dtype'] = str(torch_dtype)
+
+        from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer
+        # from modelscope import snapshot_download
+
+        # try:
+        #     model_dir = snapshot_download(self.model_id, cache_dir=model_cache_dir, local_files_only=True)
+        #     logger.warning('**Use local_files_only to load model **')
+        # except:
+        #     model_dir = snapshot_download(self.model_id,
+        #                                   revision=model_revision,
+        #                                   cache_dir=model_cache_dir, )
+        #     logger.warning('**Load model from ModelScope hub **')
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id,  # self.model_id
+                                                  revision=model_revision,
+                                                  trust_remote_code=True,
+                                                  cache_dir=model_cache_dir,)
+
+        model = AutoModelForCausalLM.from_pretrained(self.model_id,  # self.model_id
+                                                     revision=model_revision,
+                                                     device_map=device_map,
+                                                     trust_remote_code=True,
+                                                     torch_dtype=torch_dtype,
+                                                     cache_dir=model_cache_dir,)
+
+        # model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)
+
+        super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)
+
+        self._max_length = max_length
+
+    @property
+    def max_length(self):
+        if self._max_length:
+            return self._max_length
+        seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx')
+        for attr in seqlen_config_attrs:
+            if hasattr(self.model.config, attr):
+                return getattr(self.model.config, attr)
+        if hasattr(self.tokenizer, 'model_max_length'):
+            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+                return self._DEFAULT_MAX_LENGTH
+            return self.tokenizer.model_max_length
+        return self._DEFAULT_MAX_LENGTH
+
+    @torch.no_grad()
+    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+        """
+        Multi-choice model prediction func.
+
+        Args:
+            inputs (dict): The inputs for a doc. Format:
+                {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
+
+            infer_cfg (dict): inference configuration.
+
+        Returns:
+            res (dict): The model prediction results. Format:
+            {
+              'choices': [
+                {
+                  'index': 0,
+                  'message': {
+                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
+                    'role': 'assistant'
+                  }
+                }
+              ],
+              'created': 1677664795,
+              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
+              'model': 'gpt-3.5-turbo-0613',
+              'object': 'chat.completion',
+              'usage': {
+                'completion_tokens': 17,
+                'prompt_tokens': 57,
+                'total_tokens': 74
+              }
+            }
+        """
+        infer_cfg = infer_cfg or {}
+        self.model.generation_config.update(**infer_cfg)
+
+        input_data = inputs['data']
+        multi_choices = inputs['multi_choices']
+
+        output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
+        assert output.shape[0] == 1
+        logits = output.flatten()
+
+        choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
+        softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
+
+        if softval.dtype in {torch.bfloat16, torch.float16}:
+            softval = softval.to(dtype=torch.float32)
+        probs = softval.detach().cpu().numpy()
+        pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D
+
+        res_d = {
+            'choices': [
+                {
+                    'index': 0,
+                    'message': {
+                        'content': pred,
+                        'role': 'assistant'
+                    }
+                }
+            ],
+            'created': time.time(),
+            'model': self.model_id,
+            'object': 'chat.completion',
+            'usage': {}
+        }
+
+        return res_d
+
+    @staticmethod
+    def _get_logits(tokenizer, model, inputs: List[str]):
+        input_ids = tokenizer(inputs, padding=False)['input_ids']
+        input_ids = torch.tensor(input_ids, device=model.device)
+        tokens = {'input_ids': input_ids}
+
+        outputs = model(input_ids)['logits']
+        logits = outputs[:, -1, :]
+        log_probs = torch.nn.functional.softmax(logits, dim=-1)
+        return log_probs, {'tokens': tokens}
+
+
+class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
+
+    def __init__(self,
+                 model_id: str,
+                 device_map: str = 'auto',
+                 torch_dtype: dtype = torch.bfloat16,
+                 model_revision: str = None,
+                 cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                 **kwargs):
+        """
+        Continuation-logits model adapter.
+
+        Args:
+            model_id: The model id on ModelScope, or local model_dir.
+            device_map: The device map for model inference.
+            torch_dtype: The torch dtype for model inference. Default: torch.bfloat16.
+            model_revision: The model revision on ModelScope. Default: None.
+            **kwargs: Other args.
+        """
+
+        super().__init__(model_id=model_id,
+                         device_map=device_map,
+                         torch_dtype=torch_dtype,
+                         model_revision=model_revision,
+                         cache_dir=cache_dir,
+                         **kwargs)
+
+    @torch.no_grad()
+    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+        """
+        Multi-choice model prediction func.
+        Args:
+            inputs (dict): The inputs for a doc. Format:
+                {'data': [(context, continuation), ...]}
+            infer_cfg (dict): inference configuration.
+        Returns:
+            res (dict): The model prediction results. Format:
+            {
+              'choices': [
+                {
+                  'index': 0,
+                  'message': {
+                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
+                    'role': 'assistant'
+                  }
+                }
+              ],
+              'created': 1677664795,
+              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
+              'model': 'gpt-3.5-turbo-0613',
+              'object': 'chat.completion',
+              'usage': {
+                'completion_tokens': 17,
+                'prompt_tokens': 57,
+                'total_tokens': 74
+              }
+            }
+        """
+        infer_cfg = infer_cfg or {}
+
+        pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
+
+        res_d = {
+            'choices': [
+                {
+                    'index': 0,
+                    'message': {
+                        'content': pred_list,
+                        'role': 'assistant'
+                    }
+                }
+            ],
+            'created': time.time(),
+            'model': self.model_id,
+            'object': 'chat.completion',
+            'usage': {}
+        }
+        return res_d
+
+    def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
+        self.model.generation_config.update(**infer_cfg)
+        # To predict one doc
+        doc_ele_pred = []
+        for ctx, continuation in inputs:
+
+            # ctx_enc shape: [context_tok_len]  cont_enc shape: [continuation_tok_len]
+            ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
+
+            inputs_tokens = torch.tensor(
+                (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
+                dtype=torch.long,
+                device=self.model.device).unsqueeze(0)
+
+            logits = self.model(inputs_tokens)[0]
+            logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
+
+            logits = logits[:, -len(cont_enc):, :]
+            cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
+            logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
+
+            choice_score = float(logits.sum())
+            doc_ele_pred.append(choice_score)
+
+        # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
+        return doc_ele_pred
+
+    def _encode_pair(self, context, continuation):
+        n_spaces = len(context) - len(context.rstrip())
+        if n_spaces > 0:
+            continuation = context[-n_spaces:] + continuation
+            context = context[:-n_spaces]
+
+        whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
+        whole_enc = torch.tensor(whole_enc, device=self.device)
+
+        context_enc = self.tokenizer(context, padding=False)['input_ids']
+        context_enc = torch.tensor(context_enc, device=self.device)
+
+        context_enc_len = len(context_enc)
+        continuation_enc = whole_enc[context_enc_len:]
+
+        return context_enc, continuation_enc
+
+
+class ChatGenerationModelAdapter(BaseModelAdapter):
+
+    def __init__(self,
+                 model_id: str,
+                 model_revision: str,
+                 device_map: str = 'auto',
+                 torch_dtype: dtype = torch.float16,
+                 cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                 **kwargs):
+        """
+        Chat completion model adapter. Tasks of chat and generation are supported.
+
+        Args:
+            model_id: The model id on ModelScope, or local model_dir.
+            model_revision: The model revision on ModelScope. Default: None.
+            device_map: The device map for model inference.
+            torch_dtype: The torch dtype for model inference. Default: torch.float16.
+            **kwargs: Other args.
+        """
+        model_cache_dir = get_model_cache_dir(root_cache_dir=cache_dir)
+
+        self.model_id: str = model_id
+        self.model_revision: str = model_revision
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        logger.warning(f'**Device: {self.device}')
+
+        torch_dtype = torch_dtype if torch_dtype is not None else 'auto'
+
+        model_cfg: dict = dict()
+        model_cfg['model_id'] = model_id
+        model_cfg['device_map'] = device_map
+        model_cfg['torch_dtype'] = str(torch_dtype)
+
+        self.template_type = kwargs.pop('template_type', None)
+        logger.warning(f'**Template type: {self.template_type}')
+
+        from evalscope.models.template import TemplateType
+        if isinstance(self.model_id, str) \
+                and os.path.isdir(os.path.expanduser(self.model_id)) \
+                and self.template_type is None:
+            raise ValueError(f'Please specify the --template-type for local model dir.\n'
+                             f'Available template types: {TemplateType.get_template_name_list()}\n'
+                             f'Refer to `https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md` for more details.')
+
+        from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer
+        # from modelscope import snapshot_download
+
+        # try:
+        #     model_dir = snapshot_download(self.model_id, cache_dir=model_cache_dir, local_files_only=True)
+        #     logger.warning('**Use local_files_only to load model **')
+        # except:
+        #     model_dir = snapshot_download(self.model_id,
+        #                                   revision=model_revision,
+        #                                   cache_dir=model_cache_dir, )
+        #     logger.warning('**Load model from ModelScope hub **')
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id,
                                                  revision=model_revision,
+                                                  trust_remote_code=True,
+                                                  cache_dir=model_cache_dir,)
+
+        model = AutoModelForCausalLM.from_pretrained(self.model_id,
+                                                     revision=model_revision,
+                                                     device_map=device_map,
+                                                     trust_remote_code=True,
+                                                     torch_dtype=torch_dtype,
+                                                     cache_dir=model_cache_dir,)
+
+        self.origin_tokenizer = deepcopy(tokenizer)
+
+        self.generation_config, self.generation_template = self._parse_generation_config(tokenizer, model)
+        logger.info(f'**Generation config init: {self.generation_config.to_dict()}')
+
+        super().__init__(model=model, tokenizer=self.generation_template.tokenizer, model_cfg=model_cfg)
+
+    def _parse_generation_config(self, tokenizer, model):
+        from modelscope.utils.hf_util import GenerationConfig
+
+        generation_config = getattr(model, 'generation_config', GenerationConfig())
+
+        try:
+            remote_config = GenerationConfig.from_pretrained(
+                self.model_id,
+                revision=self.model_revision,
+                trust_remote_code=True)
+            generation_config.update(**remote_config.to_dict())
+        except:
+            logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')
+
+        # Parse templates for chat-completion
+        if isinstance(self.model_id, str) and os.path.exists(self.model_id):
+            logger.warning(f'Got local model dir: {self.model_id}')
+
+        generation_template = get_template(template_type=self.template_type, tokenizer=tokenizer)
+
+        if tokenizer.eos_token_id is not None:
+            generation_config.eos_token_id = tokenizer.eos_token_id
+        if tokenizer.pad_token_id is not None:
+            generation_config.pad_token_id = tokenizer.pad_token_id
+        if generation_config.max_new_tokens is None:
+            generation_config.max_new_tokens = 2048
+
+        return generation_config, generation_template
+
+    def _model_generate(self, query: str, infer_cfg: dict) -> str:
+        example = dict(query=query,
+                       history=[],
+                       system=None)
+
+        inputs, _ = self.generation_template.encode(example)
+        input_ids = inputs['input_ids']
+        input_ids = torch.tensor(input_ids)[None].to(self.device)
+        attention_mask = torch.ones_like(input_ids).to(self.device)
+
+        # Process infer_cfg
+        infer_cfg = infer_cfg or {}
+        if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
+            infer_cfg['do_sample'] = True
+
+        # TODO: stop settings
+        stop = infer_cfg.get('stop', None)
+        eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
+            if stop else self.tokenizer.eos_token_id
+
+        if eos_token_id is not None:
+            infer_cfg['eos_token_id'] = eos_token_id
+            infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token
+
+        self.generation_config.update(**infer_cfg)
+
+        # stopping
+        stop_words = [self.generation_template.suffix[-1]]
+        decode_kwargs = {}
+        stopping_criteria = StoppingCriteriaList(
+            [StopWordsCriteria(self.tokenizer, stop_words, **decode_kwargs)])
+
+        # Run inference
+        output_ids = self.model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            generation_config=self.generation_config,
+            stopping_criteria=stopping_criteria, )
+
+        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], True, **decode_kwargs)
+        return response
+
+    @torch.no_grad()
+    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = dict({})) -> dict:
+
+        # Process inputs
+        if isinstance(inputs, str):
+            query = inputs
+        elif isinstance(inputs, dict):
+            query = inputs['data'][0]
+        elif isinstance(inputs, list):
+            query = '\n'.join(inputs)
+        else:
+            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+
+        response = self._model_generate(query, infer_cfg)
+
+        choices_list = [
+            {'index': 0,
+             'message': {'content': response,
+                         'role': 'assistant'}
+             }
+        ]
+
+        res_d = {
+            'choices': choices_list,
+            'created': time.time(),
+            'model': self.model_id,
+            'object': 'chat.completion',
+            'usage': {}
+        }
+
+        return res_d
+
+
+class CustomModelAdapter(BaseModelAdapter):
+
+    def __init__(self, custom_model: CustomModel, **kwargs):
+        """
+        Custom model adapter.
+
+        Args:
+            custom_model: The custom model instance.
+            **kwargs: Other args.
+        """
+        self.custom_model = custom_model
+        super(CustomModelAdapter, self).__init__(model=None, tokenizer=None, model_cfg=custom_model.config)
+
+    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
+        """
+        Model prediction func.
+
+        Args:
+            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
+                str: 'xxx'
+                dict: {'data': [full_prompt]}
+                list: ['xxx', 'yyy', 'zzz']
+            **kwargs: kwargs
+
+        Returns:
+            res (dict): The model prediction results. Format:
+            {
+              'choices': [
+                {
+                  'index': 0,
+                  'message': {
+                    'content': 'xxx',
+                    'role': 'assistant'
+                  }
+                }
+              ],
+              'created': 1677664795,
+              'model': 'gpt-3.5-turbo-0613',  # should be model_id
+              'object': 'chat.completion',
+              'usage': {
+                'completion_tokens': 17,
+                'prompt_tokens': 57,
+                'total_tokens': 74
+              }
+            }
+        """
+        in_prompts = []
+
+        # Note: here we assume the inputs are all prompts for the benchmark.
+        for input_prompt in inputs:
+            if isinstance(input_prompt, str):
+                in_prompts.append(input_prompt)
+            elif isinstance(input_prompt, dict):
+                # TODO: to be supported for continuation list like truthful_qa
+                in_prompts.append(input_prompt['data'][0])
+            elif isinstance(input_prompt, list):
+                in_prompts.append('\n'.join(input_prompt))
+            else:
+                raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
+
+        return self.custom_model.predict(prompts=in_prompts, **kwargs)
+
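As a usage sketch only (not part of the diff): based on the constructor and predict signatures above, a chat-generation run might look like the following. The model id and revision reuse the placeholder values from the docstrings, template_type='llama' is an assumed placeholder, and the import path evalscope.models.model_adapter is inferred from the file list.

from evalscope.models.model_adapter import ChatGenerationModelAdapter  # inferred path

# Placeholder model id/revision taken from the docstring examples above;
# template_type is popped from **kwargs by the adapter and is a guess here.
adapter = ChatGenerationModelAdapter(model_id='modelscope/Llama-2-7b-chat-ms',
                                     model_revision='v1.0.0',
                                     template_type='llama')

# A plain string is accepted as `inputs`; infer_cfg keys are forwarded to the
# generation config via generation_config.update(**infer_cfg).
res = adapter.predict('Who won the world series in 2020?',
                      infer_cfg={'max_new_tokens': 64, 'do_sample': False})
print(res['choices'][0]['message']['content'])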