evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
tests/cli/test_run.py CHANGED
@@ -4,6 +4,7 @@ import subprocess
 import torch
 import unittest
 
+from evalscope.constants import EvalType
 from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
@@ -70,7 +71,19 @@ class TestRun(unittest.TestCase):
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
-        task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'gsm8k', 'arc'], 'limit': 2, 'debug': False}
+        task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
+                    'datasets': [
+                        'mmlu_pro',
+                        # 'bbh',
+                        'hellaswag',
+                        # 'gsm8k',
+                        # 'arc'
+                        # 'race',
+                        # 'truthful_qa',
+                        # 'trivia_qa',
+                    ],
+                    'limit': 20,
+                    'debug': True}
         run_task(task_cfg=task_cfg)
 
 
@@ -110,5 +123,43 @@
 
         run_task(task_cfg=task_cfg)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_server_model(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='Qwen2.5-7B-Instruct',
+            api_url='http://127.0.0.1:8801/v1/chat/completions',
+            api_key='EMPTY',
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'iquiz',
+                # 'ifeval',
+                # 'mmlu',
+                # 'mmlu_pro',
+                # 'race',
+                # 'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'competition_math',
+                # 'gsm8k',
+                # 'arc',
+                # 'ceval',
+                # 'bbh',
+                # 'hellaswag',
+            ],
+            dataset_args={
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture', 'college_programming'
+                    ]
+                }
+            },
+            # limit=10
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
 if __name__ == '__main__':
     unittest.main()
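The added test_run_server_model case above exercises the new service-evaluation path (EvalType.SERVICE against an OpenAI-compatible endpoint, presumably served by the new evalscope/models/server_adapter.py listed in the file changes). For reference, a minimal standalone sketch of the same configuration; the endpoint URL, API key and dataset choice are the placeholder values from the test, not defaults shipped by evalscope:

    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType
    from evalscope.run import run_task

    # Sketch mirroring the added test: point api_url at your own
    # OpenAI-compatible /v1/chat/completions endpoint before running.
    task_cfg = TaskConfig(
        model='Qwen2.5-7B-Instruct',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        api_key='EMPTY',
        eval_type=EvalType.SERVICE,
        datasets=['iquiz'],
    )

    run_task(task_cfg=task_cfg)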
tests/rag/test_mteb.py CHANGED
@@ -79,7 +79,7 @@ class TestMTEB(unittest.TestCase):
                 },
             },
             {
-                'model_name_or_path': 'OpenBMB/MiniCPM-Reranker',
+                'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
                 'is_cross_encoder': True,
                 'max_seq_length': 512,
                 'prompt': '为这个问题生成一个检索用的表示',
@@ -94,7 +94,8 @@
                 'verbosity': 2,
                 'output_folder': 'outputs',
                 'overwrite_results': True,
-                'limits': 10,
+                # 'limits': 10,
+                'top_k': 10,
             },
         },
     }
evalscope/models/api/__init__.py DELETED
@@ -1,3 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.models.api.openai_api import OpenaiApi
evalscope/models/dummy_chat_model.py DELETED
@@ -1,49 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import random
-import time
-
-from evalscope.models import ChatBaseModel
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class DummyChatModel(ChatBaseModel):
-
-    MODEL_ID = 'dummy_chat_model_0801'
-    REVISION = 'v1.0.0'
-
-    def __init__(self, model_cfg: dict, **kwargs):
-        model_cfg['model_id'] = self.MODEL_ID
-        model_cfg['revision'] = self.REVISION
-        super(DummyChatModel, self).__init__(model_cfg=model_cfg)
-
-    def predict(self, inputs: dict, **kwargs) -> dict:
-
-        debug: bool = False
-        if debug:
-            messages = inputs['messages']
-            history = inputs['history']
-
-            logger.info(f'** messages: {messages}')
-            logger.info(f'** history: {history}')
-
-        choice = random.choice(['A', 'B', 'C', 'D'])
-
-        # Build response
-        res = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': choice,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.MODEL_ID + '-' + self.REVISION,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res
evalscope/models/model_adapter.py DELETED
@@ -1,525 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright (c) EleutherAI, Inc. and its affiliates.
-# flake8: noqa
-import numpy as np
-import os
-import sys
-import time
-import torch
-from abc import ABC, abstractmethod
-from copy import deepcopy
-from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
-from torch import dtype
-from typing import Any, Dict, List, Union
-
-from evalscope.constants import DEFAULT_MODEL_CACHE_DIR
-from evalscope.models.custom import CustomModel
-from evalscope.utils.chat_service import ChatMessage
-from evalscope.utils.logger import get_logger
-from evalscope.utils.model_utils import fix_do_sample_warning
-
-logger = get_logger()
-
-
-class BaseModelAdapter(ABC):
-    """
-    Base class for model adapter.
-    """
-
-    def __init__(self, model, tokenizer, model_cfg: dict):
-        """
-        Args:
-            model: The model instance which is compatible with
-                AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers.
-            tokenizer: The tokenizer instance which is compatible with AutoTokenizer of transformers.
-            model_cfg:
-                Attributes: model_id, model_revision, device_map, torch_dtype
-        """
-        self.model = model
-        self.tokenizer = tokenizer
-        self.model_cfg = model_cfg
-
-    @abstractmethod
-    @torch.no_grad()
-    def predict(self, *args, **kwargs) -> Any:
-        """
-        Model prediction func.
-        """
-        raise NotImplementedError
-
-
-class MultiChoiceModelAdapter(BaseModelAdapter):
-    """ The multi-choice model adapter. """
-
-    _DEFAULT_MAX_LENGTH = 2048
-
-    def __init__(self,
-                 model_id: str,
-                 device_map: str = 'auto',
-                 torch_dtype: dtype = torch.bfloat16,
-                 model_revision: str = None,
-                 max_length: int = None,
-                 cache_dir: str = None,
-                 **kwargs):
-        """
-        Args:
-            model_id: The model id on ModelScope, or local model_dir. TODO: torch.nn.module to be supported.
-            device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default: torch.bfloat16.
-            model_revision: The model revision on ModelScope. Default: None.
-            max_length: The max length of input sequence. Default: None.
-            **kwargs: Other args.
-        """
-        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-
-        self.model_id: str = model_id
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'Device: {self.device}')
-
-        torch_dtype = torch_dtype if torch_dtype is not None else 'auto'
-
-        model_cfg: dict = dict()
-        model_cfg['model_id'] = model_id
-        model_cfg['device_map'] = device_map
-        model_cfg['torch_dtype'] = str(torch_dtype)
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_id,  # self.model_id
-            revision=model_revision,
-            trust_remote_code=True,
-            cache_dir=model_cache_dir,
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id,  # self.model_id
-            revision=model_revision,
-            device_map=device_map,
-            trust_remote_code=True,
-            torch_dtype=torch_dtype,
-            cache_dir=model_cache_dir,
-        )
-
-        super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)
-
-        self._max_length = max_length
-
-    @property
-    def max_length(self):
-        if self._max_length:
-            return self._max_length
-        seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx')
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, 'model_max_length'):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
-
-    @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
-        """
-        Multi-choice model prediction func.
-
-        Args:
-            inputs (dict): The inputs for a doc. Format:
-                {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
-
-            infer_cfg (dict): inference configuration.
-
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
-                    'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-              'model': 'gpt-3.5-turbo-0613',
-              'object': 'chat.completion',
-              'usage': {
-                'completion_tokens': 17,
-                'prompt_tokens': 57,
-                'total_tokens': 74
-              }
-            }
-        """
-        infer_cfg = infer_cfg or {}
-        self.model.generation_config.update(**infer_cfg)
-
-        input_data = inputs['data']
-        multi_choices = inputs['multi_choices']
-
-        output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
-        assert output.shape[0] == 1
-        logits = output.flatten()
-
-        choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
-        softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
-
-        if softval.dtype in {torch.bfloat16, torch.float16}:
-            softval = softval.to(dtype=torch.float32)
-        probs = softval.detach().cpu().numpy()
-        pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D
-
-        res_d = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': pred,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res_d
-
-    @staticmethod
-    def _get_logits(tokenizer, model, inputs: List[str]):
-        input_ids = tokenizer(inputs, padding=False)['input_ids']
-        input_ids = torch.tensor(input_ids, device=model.device)
-        tokens = {'input_ids': input_ids}
-
-        outputs = model(input_ids)['logits']
-        logits = outputs[:, -1, :]
-        log_probs = torch.nn.functional.softmax(logits, dim=-1)
-        return log_probs, {'tokens': tokens}
-
-
-class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
-
-    def __init__(self,
-                 model_id: str,
-                 device_map: str = 'auto',
-                 torch_dtype: dtype = torch.bfloat16,
-                 model_revision: str = None,
-                 cache_dir: str = None,
-                 **kwargs):
-        """
-        Continuation-logits model adapter.
-
-        Args:
-            model_id: The model id on ModelScope, or local model_dir.
-            device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default: torch.bfloat16.
-            model_revision: The model revision on ModelScope. Default: None.
-            **kwargs: Other args.
-        """
-
-        super().__init__(
-            model_id=model_id,
-            device_map=device_map,
-            torch_dtype=torch_dtype,
-            model_revision=model_revision,
-            cache_dir=cache_dir,
-            **kwargs)
-
-    @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
-        """
-        Multi-choice model prediction func.
-        Args:
-            inputs (dict): The inputs for a doc. Format:
-                {'data': [(context, continuation), ...]}
-            infer_cfg (dict): inference configuration.
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
-                    'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-              'model': 'gpt-3.5-turbo-0613',
-              'object': 'chat.completion',
-              'usage': {
-                'completion_tokens': 17,
-                'prompt_tokens': 57,
-                'total_tokens': 74
-              }
-            }
-        """
-        infer_cfg = infer_cfg or {}
-
-        pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
-
-        res_d = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': pred_list,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-        return res_d
-
-    def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
-        self.model.generation_config.update(**infer_cfg)
-        # To predict one doc
-        doc_ele_pred = []
-        for ctx, continuation in inputs:
-
-            # ctx_enc shape: [context_tok_len]  cont_enc shape: [continuation_tok_len]
-            ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
-
-            inputs_tokens = torch.tensor(
-                (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
-                dtype=torch.long,
-                device=self.model.device).unsqueeze(0)
-
-            logits = self.model(inputs_tokens)[0]
-            logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
-
-            logits = logits[:, -len(cont_enc):, :]
-            cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
-            logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
-
-            choice_score = float(logits.sum())
-            doc_ele_pred.append(choice_score)
-
-        # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
-        return doc_ele_pred
-
-    def _encode_pair(self, context, continuation):
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-
-        whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
-        whole_enc = torch.tensor(whole_enc, device=self.device)
-
-        context_enc = self.tokenizer(context, padding=False)['input_ids']
-        context_enc = torch.tensor(context_enc, device=self.device)
-
-        context_enc_len = len(context_enc)
-        continuation_enc = whole_enc[context_enc_len:]
-
-        return context_enc, continuation_enc
-
-
-class ChatGenerationModelAdapter(BaseModelAdapter):
-
-    def __init__(self,
-                 model_id: str,
-                 model_revision: str = 'master',
-                 device_map: str = 'auto',
-                 torch_dtype: dtype = 'auto',
-                 cache_dir: str = None,
-                 **kwargs):
-        """
-        Chat completion model adapter. Tasks of chat and generation are supported.
-
-        Args:
-            model_id: The model id on ModelScope, or local model_dir.
-            model_revision: The model revision on ModelScope. Default: None.
-            device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default: 'auto'.
-            **kwargs: Other args.
-        """
-
-        custom_generation_config = kwargs.pop('generation_config', None)
-        custom_chat_template = kwargs.pop('chat_template', None)
-        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-
-        self.model_id: str = model_id
-        self.model_revision: str = model_revision
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'Device: {self.device}')
-
-        torch_dtype = torch_dtype if torch_dtype is not None else 'auto'
-
-        model_cfg: dict = dict()
-        model_cfg['model_id'] = model_id
-        model_cfg['device_map'] = device_map
-        model_cfg['torch_dtype'] = str(torch_dtype)
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_id,
-            revision=model_revision,
-            trust_remote_code=True,
-            cache_dir=model_cache_dir,
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id,
-            revision=model_revision,
-            device_map=device_map,
-            trust_remote_code=True,
-            torch_dtype=torch_dtype,
-            cache_dir=model_cache_dir,
-        )
-
-        self.generation_config = self._parse_generation_config(tokenizer, model)
-
-        if custom_generation_config:
-            logger.info('Updating generation config ...')
-            self.generation_config.update(**custom_generation_config)
-
-        if custom_chat_template:
-            tokenizer.chat_template = custom_chat_template
-            logger.info(f'Using custom chat template: {custom_chat_template}')
-
-        super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)
-
-    def _parse_generation_config(self, tokenizer, model):
-        generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))
-
-        try:
-            remote_config = GenerationConfig.from_pretrained(
-                self.model_id, revision=self.model_revision, trust_remote_code=True)
-            generation_config.update(**remote_config.to_dict())
-        except:
-            logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')
-
-        if isinstance(self.model_id, str) and os.path.exists(self.model_id):
-            logger.warning(f'Got local model dir: {self.model_id}')
-
-        if tokenizer.eos_token_id is not None:
-            generation_config.eos_token_id = tokenizer.eos_token_id
-        if tokenizer.pad_token_id is not None:
-            generation_config.pad_token_id = tokenizer.pad_token_id
-        if generation_config.max_new_tokens is None:
-            generation_config.max_new_tokens = 2048
-
-        return generation_config
-
-    def _model_generate(self, query: str, infer_cfg: dict) -> str:
-        messages = [ChatMessage(role='user', content=query)]
-        formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-        input_ids = inputs['input_ids']
-
-        # Process infer_cfg
-        if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
-            infer_cfg['do_sample'] = True
-
-        # stop settings
-        stop = infer_cfg.get('stop', None)
-        eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
-            if stop else self.tokenizer.eos_token_id
-
-        if eos_token_id is not None:
-            infer_cfg['eos_token_id'] = eos_token_id
-            infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token
-
-        self.generation_config.update(**infer_cfg)
-        fix_do_sample_warning(self.generation_config)
-
-        # Run inference
-        output_ids = self.model.generate(input_ids, generation_config=self.generation_config)
-
-        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
-        return response
-
-    @torch.no_grad()
-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:
-
-        # Process inputs
-        if isinstance(inputs, str):
-            query = inputs
-        elif isinstance(inputs, dict):
-            query = inputs['data'][0]
-        elif isinstance(inputs, list):
-            query = '\n'.join(inputs)
-        else:
-            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
-
-        response = self._model_generate(query, infer_cfg)
-
-        choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}]
-
-        res_d = {
-            'choices': choices_list,
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res_d
-
-
-class CustomModelAdapter(BaseModelAdapter):
-
-    def __init__(self, custom_model: CustomModel, **kwargs):
-        """
-        Custom model adapter.
-
-        Args:
-            custom_model: The custom model instance.
-            **kwargs: Other args.
-        """
-        self.custom_model = custom_model
-        super(CustomModelAdapter, self).__init__(model=None, tokenizer=None, model_cfg=custom_model.config)
-
-    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
-        """
-        Model prediction func.
-
-        Args:
-            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
-                str: 'xxx'
-                dict: {'data': [full_prompt]}
-                list: ['xxx', 'yyy', 'zzz']
-            **kwargs: kwargs
-
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                    'content': 'xxx',
-                    'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              'model': 'gpt-3.5-turbo-0613',  # should be model_id
-              'object': 'chat.completion',
-              'usage': {
-                'completion_tokens': 17,
-                'prompt_tokens': 57,
-                'total_tokens': 74
-              }
-            }
-        """
-        in_prompts = []
-
-        # Note: here we assume the inputs are all prompts for the benchmark.
-        for input_prompt in inputs:
-            if isinstance(input_prompt, str):
-                in_prompts.append(input_prompt)
-            elif isinstance(input_prompt, dict):
-                # TODO: to be supported for continuation list like truthful_qa
-                in_prompts.append(input_prompt['data'][0])
-            elif isinstance(input_prompt, list):
-                in_prompts.append('\n'.join(input_prompt))
-            else:
-                raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
-
-        return self.custom_model.predict(prompts=in_prompts, **kwargs)