sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sglang/bench_one_batch.py +1 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/lang/chat_template.py +44 -0
  4. sglang/srt/configs/deepseekvl2.py +3 -0
  5. sglang/srt/configs/device_config.py +1 -1
  6. sglang/srt/configs/internvl.py +696 -0
  7. sglang/srt/configs/janus_pro.py +3 -0
  8. sglang/srt/configs/model_config.py +17 -0
  9. sglang/srt/constrained/xgrammar_backend.py +11 -19
  10. sglang/srt/conversation.py +30 -3
  11. sglang/srt/disaggregation/decode.py +4 -1
  12. sglang/srt/disaggregation/mini_lb.py +74 -23
  13. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  14. sglang/srt/disaggregation/nixl/conn.py +241 -71
  15. sglang/srt/disaggregation/utils.py +44 -1
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  17. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  19. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  20. sglang/srt/distributed/parallel_state.py +22 -1
  21. sglang/srt/entrypoints/engine.py +14 -2
  22. sglang/srt/entrypoints/http_server.py +28 -1
  23. sglang/srt/entrypoints/verl_engine.py +3 -2
  24. sglang/srt/hf_transformers_utils.py +20 -1
  25. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  26. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  27. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  28. sglang/srt/layers/attention/merge_state.py +46 -0
  29. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  30. sglang/srt/layers/attention/vision.py +290 -163
  31. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  32. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  33. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
  37. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  38. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  39. sglang/srt/layers/quantization/deep_gemm.py +5 -0
  40. sglang/srt/layers/quantization/fp8.py +108 -95
  41. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  42. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  43. sglang/srt/layers/quantization/kv_cache.py +3 -10
  44. sglang/srt/layers/quantization/utils.py +0 -5
  45. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  46. sglang/srt/lora/lora_manager.py +10 -13
  47. sglang/srt/managers/cache_controller.py +115 -119
  48. sglang/srt/managers/io_struct.py +10 -0
  49. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  50. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  51. sglang/srt/managers/schedule_batch.py +19 -1
  52. sglang/srt/managers/schedule_policy.py +11 -5
  53. sglang/srt/managers/scheduler.py +28 -13
  54. sglang/srt/managers/tokenizer_manager.py +24 -13
  55. sglang/srt/managers/tp_worker.py +9 -12
  56. sglang/srt/mem_cache/chunk_cache.py +2 -0
  57. sglang/srt/mem_cache/memory_pool.py +2 -2
  58. sglang/srt/model_executor/model_runner.py +44 -33
  59. sglang/srt/model_loader/loader.py +18 -11
  60. sglang/srt/models/clip.py +4 -4
  61. sglang/srt/models/deepseek_janus_pro.py +1 -1
  62. sglang/srt/models/deepseek_nextn.py +1 -20
  63. sglang/srt/models/deepseek_v2.py +55 -20
  64. sglang/srt/models/gemma3_mm.py +1 -1
  65. sglang/srt/models/internlm2.py +3 -0
  66. sglang/srt/models/internvl.py +670 -0
  67. sglang/srt/models/llama.py +1 -1
  68. sglang/srt/models/llama4.py +53 -7
  69. sglang/srt/models/minicpmv.py +1 -1
  70. sglang/srt/models/mllama.py +1 -1
  71. sglang/srt/models/phi3_small.py +16 -2
  72. sglang/srt/models/qwen2_5_vl.py +8 -4
  73. sglang/srt/models/qwen2_vl.py +4 -4
  74. sglang/srt/models/xiaomi_mimo.py +171 -0
  75. sglang/srt/openai_api/adapter.py +24 -40
  76. sglang/srt/openai_api/protocol.py +28 -16
  77. sglang/srt/reasoning_parser.py +2 -2
  78. sglang/srt/sampling/sampling_batch_info.py +54 -2
  79. sglang/srt/sampling/sampling_params.py +2 -0
  80. sglang/srt/server_args.py +30 -6
  81. sglang/srt/utils.py +35 -1
  82. sglang/test/test_block_fp8.py +2 -2
  83. sglang/test/test_deepep_utils.py +219 -0
  84. sglang/test/test_utils.py +3 -1
  85. sglang/version.py +1 -1
  86. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
  87. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
  88. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  89. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  90. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/configs/internvl.py (new file)
@@ -0,0 +1,696 @@
+ import copy
+ import os
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import sentencepiece as spm
+ from transformers import (
+     TOKENIZER_MAPPING,
+     LlamaConfig,
+     Phi3Config,
+     PretrainedConfig,
+     PreTrainedTokenizer,
+     PreTrainedTokenizerFast,
+     Qwen2Config,
+ )
+
+ from sglang.utils import logger
+
+ # Copied from: https://github.com/OpenGVLab/InternVL/blob/34a81000402bf8f716bab8c9b57aff1f6b436bd0/internvl_chat/internvl/model/internvl_chat/configuration_internvl_chat.py#L21
+
+
+ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+ # Modified from transformers.model.llama.configuration_llama.LlamaConfig
+ class InternLM2Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
+     an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`InternLM2Model`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details checkout [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings
+     Example:
+
+     """
+
+     model_type = "internlm2"
+     _auto_class = "AutoConfig"
+
+     def __init__(  # pylint: disable=W0102
+         self,
+         vocab_size=103168,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         hidden_act="silu",
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=0,
+         bos_token_id=1,
+         eos_token_id=2,
+         tie_word_embeddings=False,
+         bias=True,
+         rope_theta=10000,
+         rope_scaling=None,
+         attn_implementation="eager",
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.bias = bias
+
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self._rope_scaling_validation()
+
+         self.attn_implementation = attn_implementation
+         if self.attn_implementation is None:
+             self.attn_implementation = "eager"
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+             raise ValueError(
+                 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                 f"got {self.rope_scaling}"
+             )
+         rope_scaling_type = self.rope_scaling.get("type", None)
+         rope_scaling_factor = self.rope_scaling.get("factor", None)
+         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+             )
+         if (
+             rope_scaling_factor is None
+             or not isinstance(rope_scaling_factor, float)
+             or rope_scaling_factor < 1.0
+         ):
+             raise ValueError(
+                 f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}"
+             )
+
+
+ class InternVisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+     instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         num_channels (`int`, *optional*, defaults to 3):
+             Number of color channels in the input images (e.g., 3 for RGB).
+         patch_size (`int`, *optional*, defaults to 14):
+             The size (resolution) of each patch.
+         image_size (`int`, *optional*, defaults to 224):
+             The size (resolution) of each image.
+         qkv_bias (`bool`, *optional*, defaults to `False`):
+             Whether to add a bias to the queries and values in the self-attention layers.
+         hidden_size (`int`, *optional*, defaults to 3200):
+             Dimensionality of the encoder layers and the pooler layer.
+         num_attention_heads (`int`, *optional*, defaults to 25):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         intermediate_size (`int`, *optional*, defaults to 12800):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         qk_normalization (`bool`, *optional*, defaults to `True`):
+             Whether to normalize the queries and keys in the self-attention layers.
+         num_hidden_layers (`int`, *optional*, defaults to 48):
+             Number of hidden layers in the Transformer encoder.
+         use_flash_attn (`bool`, *optional*, defaults to `True`):
+             Whether to use the flash attention mechanism.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the layer normalization layers.
+         dropout (`float`, *optional*, defaults to 0.0):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         drop_path_rate (`float`, *optional*, defaults to 0.0):
+             Dropout rate for stochastic depth.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         initializer_factor (`float`, *optional*, defaults to 0.1):
+             A factor for layer scale.
+     """
+
+     model_type = "intern_vit_6b"
+
+     def __init__(
+         self,
+         num_channels=3,
+         patch_size=14,
+         image_size=224,
+         qkv_bias=False,
+         hidden_size=3200,
+         num_attention_heads=25,
+         intermediate_size=12800,
+         qk_normalization=True,
+         num_hidden_layers=48,
+         use_flash_attn=True,
+         hidden_act="gelu",
+         layer_norm_eps=1e-6,
+         dropout=0.0,
+         drop_path_rate=0.0,
+         attention_dropout=0.0,
+         initializer_range=0.02,
+         initializer_factor=0.1,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.dropout = dropout
+         self.drop_path_rate = drop_path_rate
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.initializer_range = initializer_range
+         self.initializer_factor = initializer_factor
+         self.attention_dropout = attention_dropout
+         self.layer_norm_eps = layer_norm_eps
+         self.hidden_act = hidden_act
+         self.qkv_bias = qkv_bias
+         self.qk_normalization = qk_normalization
+         self.use_flash_attn = use_flash_attn
+
+     @classmethod
+     def from_pretrained(
+         cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+     ) -> "PretrainedConfig":
+         config_dict, kwargs = cls.get_config_dict(
+             pretrained_model_name_or_path, **kwargs
+         )
+
+         if "vision_config" in config_dict:
+             config_dict = config_dict["vision_config"]
+
+         if (
+             "model_type" in config_dict
+             and hasattr(cls, "model_type")
+             and config_dict["model_type"] != cls.model_type
+         ):
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+             )
+
+         return cls.from_dict(config_dict, **kwargs)
+
+
+ class InternVLChatConfig(PretrainedConfig):
+     model_type = "internvl_chat"
+     is_composition = True
+
+     def __init__(
+         self,
+         vision_config=None,
+         llm_config=None,
+         use_backbone_lora=0,
+         use_llm_lora=0,
+         pad2square=False,
+         select_layer=-1,
+         force_image_size=None,
+         downsample_ratio=0.5,
+         template=None,
+         dynamic_image_size=False,
+         use_thumbnail=False,
+         ps_version="v1",
+         min_dynamic_patch=1,
+         max_dynamic_patch=6,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         if vision_config is None:
+             vision_config = {"architectures": ["InternVisionModel"]}
+             logger.info(
+                 "vision_config is None. Initializing the InternVisionConfig with default values."
+             )
+
+         if llm_config is None:
+             # TODO: There might still be a bug in transformers version 4.44 and above.
+             llm_config = {"architectures": [""]}
+             logger.info(
+                 "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
+             )
+         self.vision_config = InternVisionConfig(**vision_config)
+         if llm_config["architectures"][0] == "LlamaForCausalLM":
+             self.llm_config = LlamaConfig(**llm_config)
+         elif llm_config["architectures"][0] == "InternLM2ForCausalLM":
+             self.llm_config = InternLM2Config(**llm_config)
+         elif llm_config["architectures"][0] == "Phi3ForCausalLM":
+             self.llm_config = Phi3Config(**llm_config)
+         elif llm_config["architectures"][0] == "Qwen2ForCausalLM":
+             self.llm_config = Qwen2Config(**llm_config)
+         else:
+             raise ValueError(
+                 "Unsupported architecture: {}".format(llm_config["architectures"][0])
+             )
+         self.use_backbone_lora = use_backbone_lora
+         self.use_llm_lora = use_llm_lora
+         self.pad2square = pad2square
+         self.select_layer = select_layer
+         self.force_image_size = force_image_size
+         self.downsample_ratio = downsample_ratio
+         self.template = template
+         self.dynamic_image_size = dynamic_image_size
+         self.use_thumbnail = use_thumbnail
+         self.ps_version = ps_version  # pixel shuffle version
+         self.min_dynamic_patch = min_dynamic_patch
+         self.max_dynamic_patch = max_dynamic_patch
+
+         self.hidden_size = self.llm_config.hidden_size
+         # By default, we use tie_word_embeddings=False for models of all sizes.
+         self.tie_word_embeddings = False
+         self.llm_config.tie_word_embeddings = self.tie_word_embeddings
+
+     def to_dict(self):
+         """
+         Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+         Returns:
+             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+         """
+         output = copy.deepcopy(self.__dict__)
+         output["vision_config"] = self.vision_config.to_dict()
+         output["llm_config"] = self.llm_config.to_dict()
+         output["model_type"] = self.__class__.model_type
+         output["use_backbone_lora"] = self.use_backbone_lora
+         output["use_llm_lora"] = self.use_llm_lora
+         output["select_layer"] = self.select_layer
+         output["force_image_size"] = self.force_image_size
+         output["downsample_ratio"] = self.downsample_ratio
+         output["template"] = self.template
+         output["dynamic_image_size"] = self.dynamic_image_size
+         output["use_thumbnail"] = self.use_thumbnail
+         output["ps_version"] = self.ps_version
+         output["min_dynamic_patch"] = self.min_dynamic_patch
+         output["max_dynamic_patch"] = self.max_dynamic_patch
+
+         return output
+
+
+ # # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+ # class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+ #     vocab_files_names = VOCAB_FILES_NAMES
+ #     slow_tokenizer_class = InternLM2Tokenizer
+ #     padding_side = 'left'
+ #     model_input_names = ['input_ids', 'attention_mask']
+ #     _auto_class = 'AutoTokenizer'
+ #
+ #     def __init__(
+ #         self,
+ #         vocab_file,
+ #         unk_token='<unk>',
+ #         bos_token='<s>',
+ #         eos_token='</s>',
+ #         pad_token='</s>',
+ #         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ #         add_bos_token=True,
+ #         add_eos_token=False,
+ #         decode_with_prefix_space=False,
+ #         clean_up_tokenization_spaces=False,
+ #         **kwargs,
+ #     ):
+ #         super().__init__(
+ #             vocab_file=vocab_file,
+ #             unk_token=unk_token,
+ #             bos_token=bos_token,
+ #             eos_token=eos_token,
+ #             pad_token=pad_token,
+ #             sp_model_kwargs=sp_model_kwargs,
+ #             add_bos_token=add_bos_token,
+ #             add_eos_token=add_eos_token,
+ #             decode_with_prefix_space=decode_with_prefix_space,
+ #             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ #             **kwargs,
+ #         )
+ #         self._add_bos_token = add_bos_token
+ #         self._add_eos_token = add_eos_token
+ #         self.update_post_processor()
+ #         self.vocab_file = vocab_file
+ #
+ #     @property
+ #     def can_save_slow_tokenizer(self) -> bool:
+ #         return os.path.isfile(self.vocab_file) if self.vocab_file else False
+ #
+ #     def update_post_processor(self):
+ #         """
+ #         Updates the underlying post processor with the current `bos_token` and `eos_token`.
+ #         """
+ #         bos = self.bos_token
+ #         bos_token_id = self.bos_token_id
+ #         if bos is None and self.add_bos_token:
+ #             raise ValueError('add_bos_token = True but bos_token = None')
+ #
+ #         eos = self.eos_token
+ #         eos_token_id = self.eos_token_id
+ #         if eos is None and self.add_eos_token:
+ #             raise ValueError('add_eos_token = True but eos_token = None')
+ #
+ #         single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
+ #         pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
+ #
+ #         special_tokens = []
+ #         if self.add_bos_token:
+ #             special_tokens.append((bos, bos_token_id))
+ #         if self.add_eos_token:
+ #             special_tokens.append((eos, eos_token_id))
+ #         self._tokenizer.post_processor = processors.TemplateProcessing(
+ #             single=single, pair=pair, special_tokens=special_tokens
+ #         )
+ #
+ #     @property
+ #     def add_eos_token(self):
+ #         return self._add_eos_token
+ #
+ #     @property
+ #     def add_bos_token(self):
+ #         return self._add_bos_token
+ #
+ #     @add_eos_token.setter
+ #     def add_eos_token(self, value):
+ #         self._add_eos_token = value
+ #         self.update_post_processor()
+ #
+ #     @add_bos_token.setter
+ #     def add_bos_token(self, value):
+ #         self._add_bos_token = value
+ #         self.update_post_processor()
+ #
+ #     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ #         if not self.can_save_slow_tokenizer:
+ #             raise ValueError(
+ #                 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+ #                 'tokenizer.'
+ #             )
+ #
+ #         if not os.path.isdir(save_directory):
+ #             logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+ #             return
+ #         out_vocab_file = os.path.join(
+ #             save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+ #         )
+ #
+ #         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+ #             copyfile(self.vocab_file, out_vocab_file)
+ #
+ #         return (out_vocab_file,)
+
+
+ # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
+ class InternLM2Tokenizer(PreTrainedTokenizer):
+     """
+     Construct an InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     model_input_names = ["input_ids", "attention_mask"]
+     _auto_class = "AutoTokenizer"
+
+     def __init__(
+         self,
+         vocab_file,
+         unk_token="<unk>",
+         bos_token="<s>",
+         eos_token="</s>",
+         pad_token="</s>",
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         add_bos_token=True,
+         add_eos_token=False,
+         decode_with_prefix_space=False,
+         clean_up_tokenization_spaces=False,
+         **kwargs,
+     ):
+         print("register succeed")
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+         self.vocab_file = vocab_file
+         self.add_bos_token = add_bos_token
+         self.add_eos_token = add_eos_token
+         self.decode_with_prefix_space = decode_with_prefix_space
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+         self._no_prefix_space_tokens = None
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             **kwargs,
+         )
+
+     @property
+     def no_prefix_space_tokens(self):
+         if self._no_prefix_space_tokens is None:
+             vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+             self._no_prefix_space_tokens = {
+                 i for i, tok in enumerate(vocab) if not tok.startswith("▁")
+             }
+         return self._no_prefix_space_tokens
+
+     @property
+     def vocab_size(self):
+         """Returns vocab size"""
+         return self.sp_model.get_piece_size()
+
+     @property
+     def bos_token_id(self) -> Optional[int]:
+         return self.sp_model.bos_id()
+
+     @property
+     def eos_token_id(self) -> Optional[int]:
+         return self.sp_model.eos_id()
+
+     def get_vocab(self):
+         """Returns vocab as a dict"""
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text):
+         """Returns a tokenized string."""
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) in an id using the vocab."""
+         return self.sp_model.piece_to_id(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) in a token (str) using the vocab."""
+         token = self.sp_model.IdToPiece(index)
+         return token
+
+     def _maybe_add_prefix_space(self, tokens, decoded):
+         if tokens and tokens[0] not in self.no_prefix_space_tokens:
+             return " " + decoded
+         else:
+             return decoded
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) in a single string."""
+         current_sub_tokens = []
+         out_string = ""
+         prev_is_special = False
+         for token in tokens:
+             # make sure that special tokens are not decoded using sentencepiece model
+             if token in self.all_special_tokens:
+                 if not prev_is_special:
+                     out_string += " "
+                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                 prev_is_special = True
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+                 prev_is_special = False
+         out_string += self.sp_model.decode(current_sub_tokens)
+         out_string = self.clean_up_tokenization(out_string)
+         out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+         return out_string[1:]
+
+     def save_vocabulary(
+         self, save_directory, filename_prefix: Optional[str] = None
+     ) -> Tuple[str]:
+         """
+         Save the vocabulary and special tokens file to a directory.
+
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "")
+             + VOCAB_FILES_NAMES["vocab_file"],
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(
+             out_vocab_file
+         ) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         if self.add_bos_token:
+             bos_token_ids = [self.bos_token_id]
+         else:
+             bos_token_ids = []
+
+         output = bos_token_ids + token_ids_0
+
+         if token_ids_1 is not None:
+             output = output + token_ids_1
+
+         if self.add_eos_token:
+             output = output + [self.eos_token_id]
+
+         return output
+
+     def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+         already_has_special_tokens: bool = False,
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0,
+                 token_ids_1=token_ids_1,
+                 already_has_special_tokens=True,
+             )
+
+         if token_ids_1 is None:
+             return [1] + ([0] * len(token_ids_0)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
+         use of token type ids, therefore a list of zeros is returned.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of zeros.
+         """
+         eos = [self.eos_token_id]
+
+         if token_ids_1 is None:
+             return len(token_ids_0 + eos) * [0]
+         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+
+ TOKENIZER_MAPPING.register(
+     InternVLChatConfig, (InternLM2Tokenizer, None), exist_ok=True
+ )
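
For orientation, the hunk above supplies the configuration and tokenizer classes that the rest of the new InternVL support (sglang/srt/models/internvl.py, sglang/srt/managers/multimodal_processors/internvl.py) presumably builds on. The sketch below is illustrative only and is not part of the diff; it assumes the hunk is the new sglang/srt/configs/internvl.py module and uses made-up values rather than a real checkpoint.

    # Illustrative sketch (not from the package): exercising the new config classes.
    from sglang.srt.configs.internvl import InternVLChatConfig

    config = InternVLChatConfig(
        vision_config={"architectures": ["InternVisionModel"], "image_size": 448},
        llm_config={"architectures": ["InternLM2ForCausalLM"], "hidden_size": 4096},
        dynamic_image_size=True,
        max_dynamic_patch=12,
    )
    # The "architectures" field picks the LLM sub-config class (InternLM2Config here),
    # and the top-level hidden_size mirrors the LLM hidden_size.
    print(type(config.llm_config).__name__)  # InternLM2Config
    print(config.hidden_size)                # 4096

Importing the module also executes the TOKENIZER_MAPPING.register(...) call at the end of the hunk, which maps InternVLChatConfig to the slow InternLM2Tokenizer so that transformers' AutoTokenizer can resolve checkpoints that use this config.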