sglang 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,517 @@
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/fb6af8bc086328ca6659e72d11ffd4309ce4de22/vllm/model_executor/models/deepseek_v2.py
+"""Inference-only DeepseekV2 model."""
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+from vllm.config import CacheConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.managers.controller.model_runner import InputMetadata
+
+
+class DeepseekV2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class DeepseekV2MoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_shared_experts = config.n_shared_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.experts = FusedMoE(
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size, config.n_routed_experts, bias=False, quant_config=None
+        )
+        if config.n_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+            )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        if self.n_shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        final_hidden_states = (
+            self.experts(hidden_states=hidden_states, router_logits=router_logits)
+            * self.routed_scaling_factor
+        )
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    import math
+
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+class DeepseekV2Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id=None,
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+            )
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+        )
+        # O projection.
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+        rope_scaling["type"] = "deepseek_yarn"
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+        )
+
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        # self.attn = Attention(self.num_heads,
+        #                       self.qk_head_dim,
+        #                       self.scaling,
+        #                       num_kv_heads=self.num_heads)
+
+        # TODO, support head_size 192
+        self.attn = RadixAttention(
+            self.num_local_heads,
+            256,
+            self.scaling,
+            num_kv_heads=self.num_local_heads,
+            layer_id=layer_id,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        latent_cache = latent_cache.unsqueeze(1)
+        kv_a = self.kv_a_layernorm(kv_a.contiguous())
+        kv = self.kv_b_proj(kv_a)[0]
+        kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim)
+        k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+        k_pe = latent_cache[:, :, self.kv_lora_rank :]
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        q[..., self.qk_nope_head_dim :] = q_pe
+        k = torch.empty_like(q)
+        k[..., : self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim :] = k_pe
+        q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], value=0).view(
+            -1, self.num_local_heads * 256
+        )
+        k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], value=0).view(
+            -1, self.num_local_heads * 256
+        )
+        v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], value=0).view(
+            -1, self.num_local_heads * 256
+        )
+        attn_output = self.attn(q, k, v, input_metadata)
+        attn_output = attn_output.view(-1, self.num_local_heads, 256)[
+            ..., : self.v_head_dim
+        ].reshape(-1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class DeepseekV2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = DeepseekV2Attention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            layer_id=layer_id,
+        )
+        if (
+            config.n_routed_experts is not None
+            and layer_id >= config.first_k_dense_replace
+            and layer_id % config.moe_layer_freq == 0
+        ):
+            self.mlp = DeepseekV2MoE(config=config, quant_config=quant_config)
+        else:
+            self.mlp = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            input_metadata=input_metadata,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class DeepseekV2Model(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.padding_id = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                DeepseekV2DecoderLayer(
+                    config,
+                    layer_id,
+                    cache_config=cache_config,
+                    quant_config=quant_config,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, input_metadata, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class DeepseekV2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = DeepseekV2Model(config, cache_config, quant_config)
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, quant_config=quant_config
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, input_metadata)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        weight_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = DeepseekV2ForCausalLM
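
Note on the attention path in the new file: RadixAttention is constructed with a hard-coded head size of 256 because head size 192 (qk_nope_head_dim + qk_rope_head_dim) is not yet supported (see the TODO above), so forward() zero-pads q/k/v up to 256 and slices the attention output back to v_head_dim before o_proj. Below is a minimal standalone sketch of that pad/slice round trip using toy tensors; the head dims assume the usual DeepSeek-V2 values (128 + 64 = 192 for q/k, 128 for v) and the kernel call itself is mocked.

    import torch

    # Toy shapes; 192 and 128 are assumed DeepSeek-V2 head dims, 256 is the padded size.
    num_tokens, num_heads, qk_head_dim, v_head_dim, padded_dim = 4, 2, 192, 128, 256

    q = torch.randn(num_tokens, num_heads, qk_head_dim)
    v = torch.randn(num_tokens, num_heads, v_head_dim)

    # Zero-pad the last dimension up to the supported head size, as in forward().
    q_pad = torch.nn.functional.pad(q, [0, padded_dim - qk_head_dim], value=0)
    v_pad = torch.nn.functional.pad(v, [0, padded_dim - v_head_dim], value=0)
    assert q_pad.shape[-1] == v_pad.shape[-1] == padded_dim

    # The attention output keeps the padded head size; only the first v_head_dim
    # channels carry real values, so they are sliced out before the output projection.
    attn_out = v_pad  # stand-in for the kernel output, same layout
    out = attn_out[..., :v_head_dim].reshape(num_tokens, num_heads * v_head_dim)
    print(out.shape)  # torch.Size([4, 256])

The padding costs extra KV-cache space and bandwidth, which is presumably what the "support head_size 192" TODO is about.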
@@ -54,9 +54,9 @@ class LlamaForClassification(nn.Module):
             next_token_logits=scores,
             next_token_logprobs=scores,
             normalized_prompt_logprobs=scores,
-            prefill_token_logprobs=torch.ones_like(input_ids),
-            prefill_top_logprobs=None,
-            decode_top_logprobs=None,
+            input_token_logprobs=torch.ones_like(input_ids),
+            input_top_logprobs=None,
+            output_top_logprobs=None,
         )
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
@@ -140,29 +140,29 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 if request.logprobs:
                     # The first chunk and echo is enabled.
                     if not stream_buffer and request.echo:
-                        prefill_token_logprobs = content["meta_info"][
-                            "prefill_token_logprobs"
+                        input_token_logprobs = content["meta_info"][
+                            "input_token_logprobs"
                         ]
-                        prefill_top_logprobs = content["meta_info"][
-                            "prefill_top_logprobs"
+                        input_top_logprobs = content["meta_info"][
+                            "input_top_logprobs"
                         ]
                     else:
-                        prefill_token_logprobs = None
-                        prefill_top_logprobs = None
+                        input_token_logprobs = None
+                        input_top_logprobs = None
 
                     logprobs = to_openai_style_logprobs(
-                        prefill_token_logprobs=prefill_token_logprobs,
-                        prefill_top_logprobs=prefill_top_logprobs,
-                        decode_token_logprobs=content["meta_info"][
-                            "decode_token_logprobs"
+                        input_token_logprobs=input_token_logprobs,
+                        input_top_logprobs=input_top_logprobs,
+                        output_token_logprobs=content["meta_info"][
+                            "output_token_logprobs"
                         ][n_prev_token:],
-                        decode_top_logprobs=content["meta_info"][
-                            "decode_top_logprobs"
+                        output_top_logprobs=content["meta_info"][
+                            "output_top_logprobs"
                         ][n_prev_token:],
                     )
 
                     n_prev_token = len(
-                        content["meta_info"]["decode_token_logprobs"]
+                        content["meta_info"]["output_token_logprobs"]
                     )
                 else:
                     logprobs = None
@@ -218,17 +218,17 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
 
         if request.logprobs:
             if request.echo:
-                prefill_token_logprobs = ret_item["meta_info"]["prefill_token_logprobs"]
-                prefill_top_logprobs = ret_item["meta_info"]["prefill_top_logprobs"]
+                input_token_logprobs = ret_item["meta_info"]["input_token_logprobs"]
+                input_top_logprobs = ret_item["meta_info"]["input_top_logprobs"]
             else:
-                prefill_token_logprobs = None
-                prefill_top_logprobs = None
+                input_token_logprobs = None
+                input_top_logprobs = None
 
             logprobs = to_openai_style_logprobs(
-                prefill_token_logprobs=prefill_token_logprobs,
-                prefill_top_logprobs=prefill_top_logprobs,
-                decode_token_logprobs=ret_item["meta_info"]["decode_token_logprobs"],
-                decode_top_logprobs=ret_item["meta_info"]["decode_top_logprobs"],
+                input_token_logprobs=input_token_logprobs,
+                input_top_logprobs=input_top_logprobs,
+                output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
+                output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
             )
         else:
             logprobs = None
@@ -401,10 +401,10 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 
 
 def to_openai_style_logprobs(
-    prefill_token_logprobs=None,
-    decode_token_logprobs=None,
-    prefill_top_logprobs=None,
-    decode_top_logprobs=None,
+    input_token_logprobs=None,
+    output_token_logprobs=None,
+    input_top_logprobs=None,
+    output_top_logprobs=None,
 ):
     ret_logprobs = LogProbs()
 
@@ -425,13 +425,13 @@ def to_openai_style_logprobs(
             else:
                 ret_logprobs.top_logprobs.append(None)
 
-    if prefill_token_logprobs is not None:
-        append_token_logprobs(prefill_token_logprobs)
-    if decode_token_logprobs is not None:
-        append_token_logprobs(decode_token_logprobs)
-    if prefill_top_logprobs is not None:
-        append_top_logprobs(prefill_top_logprobs)
-    if decode_top_logprobs is not None:
-        append_top_logprobs(decode_top_logprobs)
+    if input_token_logprobs is not None:
+        append_token_logprobs(input_token_logprobs)
+    if output_token_logprobs is not None:
+        append_token_logprobs(output_token_logprobs)
+    if input_top_logprobs is not None:
+        append_top_logprobs(input_top_logprobs)
+    if output_top_logprobs is not None:
+        append_top_logprobs(output_top_logprobs)
 
     return ret_logprobs
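
The hunks above apply one consistent rename across the logprob plumbing: prefill_* becomes input_* and decode_* becomes output_*, both in the per-request meta_info dict and in the keyword arguments of to_openai_style_logprobs. For external client code that still reads the old meta_info keys, a small compatibility shim (hypothetical, not part of sglang) could translate the names forward:

    # Hypothetical client-side shim: rename pre-0.2.6 meta_info keys to the
    # input_*/output_* names used from 0.2.6 onward.
    _RENAMES = {
        "prefill_token_logprobs": "input_token_logprobs",
        "prefill_top_logprobs": "input_top_logprobs",
        "decode_token_logprobs": "output_token_logprobs",
        "decode_top_logprobs": "output_top_logprobs",
    }


    def upgrade_meta_info(meta_info: dict) -> dict:
        """Return a copy of meta_info using the 0.2.6 key names."""
        return {_RENAMES.get(key, key): value for key, value in meta_info.items()}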
@@ -152,7 +152,7 @@ class ChatCompletionRequest(BaseModel):
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[bool] = False
     top_logprobs: Optional[int] = None
-    max_tokens: Optional[int] = 16
+    max_tokens: Optional[int] = None
     n: Optional[int] = 1
     presence_penalty: Optional[float] = 0.0
    response_format: Optional[ResponseFormat] = None
@@ -65,10 +65,11 @@ class SamplingParams:
             raise ValueError(
                 "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
             )
-        if self.max_new_tokens < 0:
-            raise ValueError(
-                f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
-            )
+        if self.max_new_tokens is not None:
+            if self.max_new_tokens < 0:
+                raise ValueError(
+                    f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
+                )
 
     def normalize(self, tokenizer):
         # Process stop strings
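
The last two hunks are related: ChatCompletionRequest.max_tokens now defaults to None instead of 16, so the value that presumably ends up in SamplingParams.max_new_tokens may itself be None, and verify() must tolerate that. A standalone sketch (not the sglang implementation) of the new None-tolerant bound check, equivalent to the nested if added above:

    # None is accepted and skips the lower-bound check; negative values still raise.
    def check_max_new_tokens(max_new_tokens) -> None:
        if max_new_tokens is not None and max_new_tokens < 0:
            raise ValueError(f"max_new_tokens must be at least 0, got {max_new_tokens}.")


    check_max_new_tokens(None)  # accepted in 0.2.6
    check_max_new_tokens(16)    # accepted
    # check_max_new_tokens(-1)  # would raise ValueError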