sglang 0.3.6.post3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Files changed (99)
  1. sglang/bench_one_batch.py +4 -0
  2. sglang/bench_serving.py +13 -0
  3. sglang/check_env.py +1 -1
  4. sglang/srt/_custom_ops.py +118 -0
  5. sglang/srt/configs/device_config.py +17 -0
  6. sglang/srt/configs/load_config.py +84 -0
  7. sglang/srt/configs/model_config.py +161 -4
  8. sglang/srt/configs/qwen2vl.py +5 -8
  9. sglang/srt/constrained/outlines_backend.py +6 -1
  10. sglang/srt/constrained/outlines_jump_forward.py +8 -1
  11. sglang/srt/distributed/__init__.py +3 -0
  12. sglang/srt/distributed/communication_op.py +34 -0
  13. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  14. sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
  15. sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
  16. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
  17. sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +204 -0
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
  21. sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
  22. sglang/srt/distributed/parallel_state.py +1275 -0
  23. sglang/srt/distributed/utils.py +223 -0
  24. sglang/srt/hf_transformers_utils.py +37 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +13 -15
  26. sglang/srt/layers/attention/torch_native_backend.py +285 -0
  27. sglang/srt/layers/fused_moe_patch.py +20 -11
  28. sglang/srt/layers/linear.py +1 -0
  29. sglang/srt/layers/logits_processor.py +17 -3
  30. sglang/srt/layers/quantization/__init__.py +34 -0
  31. sglang/srt/layers/vocab_parallel_embedding.py +1 -0
  32. sglang/srt/lora/lora.py +1 -1
  33. sglang/srt/managers/io_struct.py +48 -2
  34. sglang/srt/managers/schedule_batch.py +18 -14
  35. sglang/srt/managers/schedule_policy.py +7 -4
  36. sglang/srt/managers/scheduler.py +76 -20
  37. sglang/srt/managers/tokenizer_manager.py +166 -68
  38. sglang/srt/managers/tp_worker.py +36 -3
  39. sglang/srt/managers/tp_worker_overlap_thread.py +21 -3
  40. sglang/srt/model_executor/cuda_graph_runner.py +16 -7
  41. sglang/srt/model_executor/forward_batch_info.py +9 -4
  42. sglang/srt/model_executor/model_runner.py +136 -150
  43. sglang/srt/model_loader/__init__.py +34 -0
  44. sglang/srt/model_loader/loader.py +1139 -0
  45. sglang/srt/model_loader/utils.py +41 -0
  46. sglang/srt/model_loader/weight_utils.py +640 -0
  47. sglang/srt/models/baichuan.py +9 -10
  48. sglang/srt/models/chatglm.py +6 -15
  49. sglang/srt/models/commandr.py +2 -3
  50. sglang/srt/models/dbrx.py +2 -3
  51. sglang/srt/models/deepseek.py +4 -11
  52. sglang/srt/models/deepseek_v2.py +3 -11
  53. sglang/srt/models/exaone.py +2 -3
  54. sglang/srt/models/gemma.py +2 -6
  55. sglang/srt/models/gemma2.py +3 -14
  56. sglang/srt/models/gemma2_reward.py +0 -1
  57. sglang/srt/models/gpt2.py +5 -12
  58. sglang/srt/models/gpt_bigcode.py +6 -22
  59. sglang/srt/models/grok.py +3 -3
  60. sglang/srt/models/internlm2.py +2 -3
  61. sglang/srt/models/internlm2_reward.py +0 -1
  62. sglang/srt/models/llama.py +97 -27
  63. sglang/srt/models/llama_classification.py +1 -2
  64. sglang/srt/models/llama_embedding.py +1 -2
  65. sglang/srt/models/llama_reward.py +2 -3
  66. sglang/srt/models/llava.py +1 -4
  67. sglang/srt/models/llavavid.py +1 -2
  68. sglang/srt/models/minicpm.py +4 -7
  69. sglang/srt/models/minicpm3.py +6 -19
  70. sglang/srt/models/mixtral.py +12 -5
  71. sglang/srt/models/mixtral_quant.py +2 -3
  72. sglang/srt/models/mllama.py +3 -7
  73. sglang/srt/models/olmo.py +2 -8
  74. sglang/srt/models/olmo2.py +0 -1
  75. sglang/srt/models/olmoe.py +3 -5
  76. sglang/srt/models/phi3_small.py +8 -8
  77. sglang/srt/models/qwen.py +2 -3
  78. sglang/srt/models/qwen2.py +10 -9
  79. sglang/srt/models/qwen2_moe.py +4 -11
  80. sglang/srt/models/qwen2_vl.py +2 -6
  81. sglang/srt/models/registry.py +99 -0
  82. sglang/srt/models/stablelm.py +2 -3
  83. sglang/srt/models/torch_native_llama.py +6 -12
  84. sglang/srt/models/xverse.py +2 -4
  85. sglang/srt/models/xverse_moe.py +4 -11
  86. sglang/srt/models/yivl.py +2 -3
  87. sglang/srt/openai_api/adapter.py +9 -5
  88. sglang/srt/openai_api/protocol.py +1 -0
  89. sglang/srt/server.py +267 -170
  90. sglang/srt/server_args.py +65 -31
  91. sglang/srt/utils.py +245 -28
  92. sglang/test/test_utils.py +7 -0
  93. sglang/version.py +1 -1
  94. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/METADATA +1 -1
  95. sglang-0.4.0.dist-info/RECORD +184 -0
  96. sglang-0.3.6.post3.dist-info/RECORD +0 -162
  97. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
  98. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
  99. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
sglang/srt/models/torch_native_llama.py CHANGED
@@ -52,7 +52,6 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
@@ -66,6 +65,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 )
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
 
 tp_size = get_tensor_model_parallel_world_size()
 tp_rank = get_tensor_model_parallel_rank()
@@ -388,7 +388,6 @@ class TorchNativeLlamaForCausalLM(nn.Module):
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -396,7 +395,10 @@ class TorchNativeLlamaForCausalLM(nn.Module):
         self.torchao_config = global_server_args_dict["torchao_config"]
         self.supports_torch_tp = True
         self.model = LlamaModel(config, quant_config=quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
         # turning off autotune for fp8dq since it doesn't give speedup and
@@ -413,7 +415,7 @@ class TorchNativeLlamaForCausalLM(nn.Module):
     ) -> LogitsProcessorOutput:
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         return self.logits_processor(
-            input_ids, hidden_states, self.lm_head.weight, forward_batch
+            input_ids, hidden_states, self.lm_head, forward_batch
         )
 
     def get_hidden_dim(self, module_name):
@@ -501,14 +503,6 @@ class TorchNativeLlamaForCausalLM(nn.Module):
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
-        if (
-            hasattr(self.config, "tie_word_embeddings")
-            and self.config.tie_word_embeddings
-        ):
-            # Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
-            param = self.lm_head.weight
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, self.model.embed_tokens.weight)
         apply_torchao_config_(self, params_dict, set(["proj.weight"]))
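The two behavioral changes in this file are that tied-embedding handling moves from `load_weights` into the constructor, and that `forward` now passes the `lm_head` module itself (rather than `lm_head.weight`) to the logits processor. A minimal sketch of the resulting pattern — `TinyLM` and the plain `nn.Embedding`/`nn.Linear` are illustrative stand-ins for sglang's `LlamaModel` and `ParallelLMHead`, not the real classes:

```python
import torch
import torch.nn as nn


class TinyLM(nn.Module):
    def __init__(self, vocab_size: int, hidden_size: int, tie_word_embeddings: bool):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        if tie_word_embeddings:
            # Reuse the input embedding as the output head: one shared tensor,
            # so there is no separate lm_head.weight to load from the checkpoint.
            self.lm_head = self.embed_tokens
        else:
            self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

    def logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Both nn.Embedding and nn.Linear expose a (vocab, hidden) weight, so
        # the same matmul works either way once the caller holds the module;
        # passing the module instead of the raw tensor mirrors the new
        # logits_processor call above.
        return hidden_states @ self.lm_head.weight.t()


model = TinyLM(vocab_size=128, hidden_size=16, tie_word_embeddings=True)
assert model.lm_head.weight.data_ptr() == model.embed_tokens.weight.data_ptr()
print(model.logits(torch.randn(2, 16)).shape)  # torch.Size([2, 128])
```

Handling the tie in `__init__` also explains the deletion at the end of `load_weights`: checkpoints that ship no `lm_head.weight` load cleanly without the old special-case copy.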
sglang/srt/models/xverse.py CHANGED
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -40,6 +39,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.model_runner import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
 
 
 class XverseMLP(nn.Module):
@@ -295,8 +295,6 @@ class XverseForCausalLM(nn.Module):
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config=None,
-        efficient_weight_load=False,
     ) -> None:
         super().__init__()
         self.config = config
@@ -315,7 +313,7 @@ class XverseForCausalLM(nn.Module):
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         return self.logits_processor(
-            input_ids, hidden_states, self.lm_head.weight, forward_batch
+            input_ids, hidden_states, self.lm_head, forward_batch
         )
 
     def load_weights(
sglang/srt/models/xverse_moe.py CHANGED
@@ -32,7 +32,6 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -43,6 +42,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
 
 
 class XverseMLP(nn.Module):
@@ -181,7 +181,6 @@ class XverseAttention(nn.Module):
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -258,7 +257,6 @@ class XverseDecoderLayer(nn.Module):
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -277,7 +275,6 @@ class XverseDecoderLayer(nn.Module):
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
-            cache_config=cache_config,
             quant_config=quant_config,
         )
         if config.num_experts is not None:
@@ -326,7 +323,6 @@ class XverseModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
         super().__init__()
@@ -339,9 +335,7 @@ class XverseModel(nn.Module):
         )
         self.layers = nn.ModuleList(
             [
-                XverseDecoderLayer(
-                    config, layer_id, cache_config, quant_config=quant_config
-                )
+                XverseDecoderLayer(config, layer_id, quant_config=quant_config)
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
@@ -369,13 +363,12 @@ class XverseMoeForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.model = XverseModel(config, cache_config, quant_config)
+        self.model = XverseModel(config, quant_config)
         self.lm_head = ParallelLMHead(
             config.vocab_size, config.hidden_size, quant_config=quant_config
         )
@@ -390,7 +383,7 @@ class XverseMoeForCausalLM(nn.Module):
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, forward_batch)
         return self.logits_processor(
-            input_ids, hidden_states, self.lm_head.weight, forward_batch
+            input_ids, hidden_states, self.lm_head, forward_batch
         )
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
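Across `xverse.py` and `xverse_moe.py` the change is mechanical: the unused `cache_config` (and `efficient_weight_load`) parameters are dropped from every constructor, so call sites stop threading a dead argument through the stack. A toy sketch of the same cleanup, with stand-in classes rather than the real sglang ones:

```python
from typing import Optional


class DecoderLayer:
    # The 0.3.6.post3 signature also carried cache_config=None between
    # layer_id and quant_config; 0.4.0 removes the dead parameter.
    def __init__(self, config: dict, layer_id: int, quant_config: Optional[dict] = None):
        self.layer_id = layer_id
        self.quant_config = quant_config


class Model:
    def __init__(self, config: dict, quant_config: Optional[dict] = None):
        # The call site no longer passes cache_config positionally, so
        # quant_config stays an explicit keyword argument.
        self.layers = [
            DecoderLayer(config, layer_id, quant_config=quant_config)
            for layer_id in range(config["num_hidden_layers"])
        ]


model = Model({"num_hidden_layers": 2})
assert [layer.layer_id for layer in model.layers] == [0, 1]
```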
sglang/srt/models/yivl.py CHANGED
@@ -18,9 +18,9 @@ from typing import Iterable, Optional, Tuple
 import torch
 import torch.nn as nn
 from transformers import CLIPVisionModel, LlavaConfig
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.llava import LlavaLlamaForCausalLM
 
 
@@ -29,9 +29,8 @@ class YiVLForCausalLM(LlavaLlamaForCausalLM):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config=None,
     ) -> None:
-        super().__init__(config, quant_config, cache_config)
+        super().__init__(config, quant_config)
 
         self.multi_modal_projector = YiVLMultiModalProjector(self.config)
         self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
sglang/srt/openai_api/adapter.py CHANGED
@@ -486,6 +486,7 @@ def v1_generate_request(
     return_logprobs = []
     logprob_start_lens = []
     top_logprobs_nums = []
+    lora_paths = []
 
     for request in all_requests:
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -496,6 +497,7 @@ def v1_generate_request(
         )
 
         prompts.append(request.prompt)
+        lora_paths.append(request.lora_path)
         if request.echo and request.logprobs:
             current_logprob_start_len = 0
         else:
@@ -519,7 +521,7 @@ def v1_generate_request(
                 "skip_special_tokens": request.skip_special_tokens,
             }
         )
-        return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
+        return_logprobs.append(request.logprobs is not None)
         logprob_start_lens.append(current_logprob_start_len)
         top_logprobs_nums.append(
             request.logprobs if request.logprobs is not None else 0
@@ -534,6 +536,7 @@ def v1_generate_request(
         return_logprobs = return_logprobs[0]
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
+        lora_paths = lora_paths[0]
     else:
         if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
             prompt_kwargs = {"text": prompts}
@@ -549,6 +552,7 @@ def v1_generate_request(
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
         rid=request_ids,
+        lora_path=lora_paths,
     )
 
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
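`v1_generate_request` now gathers one `lora_path` per request and, mirroring the other per-request fields, collapses the list to a scalar when the batch holds a single request. A sketch of that normalization rule — the `Request` stand-in carries only the field this hunk touches, and the adapter names are made up:

```python
from dataclasses import dataclass
from typing import List, Optional, Union


@dataclass
class Request:
    prompt: str
    lora_path: Optional[str] = None


def collect_lora_paths(all_requests: List[Request]) -> Union[Optional[str], List[Optional[str]]]:
    lora_paths = [request.lora_path for request in all_requests]
    # Single request: the downstream input expects a scalar, not a 1-element list.
    return lora_paths[0] if len(all_requests) == 1 else lora_paths


assert collect_lora_paths([Request("hi", "adapters/sql")]) == "adapters/sql"
assert collect_lora_paths([Request("a"), Request("b", "adapters/chat")]) == [None, "adapters/chat"]
```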
@@ -591,9 +595,9 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
             text = prompts[prompt_index] + text
 
         logprobs = False
-        if isinstance(request, list) and request[idx].logprobs:
+        if isinstance(request, list) and request[idx].logprobs is not None:
             logprobs = True
-        elif (not isinstance(request, list)) and request.logprobs:
+        elif (not isinstance(request, list)) and request.logprobs is not None:
             logprobs = True
         if logprobs:
             if echo:
@@ -735,7 +739,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 # Prepend prompt in response text.
                 text = prompts + text
 
-                if request.logprobs:
+                if request.logprobs is not None:
                     # The first chunk and echo is enabled.
                     if not stream_buffer and request.echo:
                         input_token_logprobs = content["meta_info"][
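The repeated `request.logprobs` → `request.logprobs is not None` edits fix a truthiness bug: in the OpenAI completions API, `logprobs=0` is a valid request (return the chosen tokens' logprobs with zero top alternatives), but `0` is falsy in Python, so the old checks silently disabled it. A minimal illustration:

```python
# logprobs=0 means "return logprobs of the sampled tokens, no alternatives".
logprobs = 0

old_returns = bool(logprobs)          # old check: False, logprobs silently skipped
new_returns = logprobs is not None    # new check: True, only None means "off"

assert old_returns is False and new_returns is True
```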
@@ -1275,7 +1279,7 @@ def v1_embedding_request(all_requests, tokenizer_manager):
     for request in all_requests:
         prompt = request.input
         assert (
-            type(prompt) == first_prompt_type
+            type(prompt) is first_prompt_type
         ), "All prompts must be of the same type in file input settings"
         prompts.append(prompt)
 
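The `==` → `is` swap in the assertion is the idiomatic exact-type comparison (what flake8 flags as E721): class objects are unique, so an identity test is correct and cannot be fooled by a metaclass that overrides `__eq__`. For example:

```python
first_prompt_type = type("some prompt")  # str

assert type("another prompt") is first_prompt_type
assert type(["token", "ids"]) is not first_prompt_type
```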
sglang/srt/openai_api/protocol.py CHANGED
@@ -166,6 +166,7 @@ class CompletionRequest(BaseModel):
     temperature: float = 1.0
     top_p: float = 1.0
     user: Optional[str] = None
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     json_schema: Optional[str] = None
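With `lora_path` on `CompletionRequest`, a client can select a LoRA adapter per call. A hypothetical request against a local sglang server — the URL, model name, and adapter name are placeholders, and the server is assumed to have been launched with a matching `--lora-paths` entry:

```python
import openai  # openai-python client pointed at a local sglang server

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.completions.create(
    model="default",
    prompt="Write a SQL query that counts users by country.",
    max_tokens=64,
    # lora_path is an SRT-only field, so it travels via extra_body;
    # "my-adapter" is a placeholder adapter name.
    extra_body={"lora_path": "my-adapter"},
)
print(response.choices[0].text)
```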