sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. sglang/bench_serving.py +72 -10
  2. sglang/srt/_custom_ops.py +59 -92
  3. sglang/srt/configs/deepseekvl2.py +10 -1
  4. sglang/srt/configs/model_config.py +6 -16
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/custom_op.py +5 -0
  7. sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
  8. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  9. sglang/srt/distributed/parallel_state.py +32 -5
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/entrypoints/http_server.py +7 -1
  12. sglang/srt/entrypoints/verl_engine.py +2 -0
  13. sglang/srt/function_call_parser.py +0 -1
  14. sglang/srt/layers/attention/flashattention_backend.py +582 -125
  15. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  17. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  18. sglang/srt/layers/dp_attention.py +12 -1
  19. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  20. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  21. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
  26. sglang/srt/layers/moe/topk.py +79 -6
  27. sglang/srt/layers/quantization/__init__.py +137 -165
  28. sglang/srt/layers/quantization/awq.py +200 -0
  29. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  30. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  31. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  32. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  33. sglang/srt/layers/quantization/gptq.py +30 -40
  34. sglang/srt/layers/quantization/moe_wna16.py +501 -0
  35. sglang/srt/layers/quantization/utils.py +1 -1
  36. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  37. sglang/srt/lora/backend/base_backend.py +4 -4
  38. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  39. sglang/srt/lora/backend/triton_backend.py +5 -8
  40. sglang/srt/lora/layers.py +19 -33
  41. sglang/srt/lora/lora_manager.py +20 -7
  42. sglang/srt/lora/mem_pool.py +12 -6
  43. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  44. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  45. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  46. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  47. sglang/srt/lora/utils.py +6 -0
  48. sglang/srt/managers/cache_controller.py +34 -11
  49. sglang/srt/managers/io_struct.py +4 -2
  50. sglang/srt/managers/mm_utils.py +202 -156
  51. sglang/srt/managers/multimodal_processor.py +0 -2
  52. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  53. sglang/srt/managers/multimodal_processors/clip.py +44 -0
  54. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  55. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  56. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  57. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  58. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  59. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  61. sglang/srt/managers/schedule_batch.py +185 -127
  62. sglang/srt/managers/scheduler.py +29 -23
  63. sglang/srt/managers/tokenizer_manager.py +1 -2
  64. sglang/srt/managers/tp_worker.py +3 -0
  65. sglang/srt/managers/utils.py +1 -6
  66. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  67. sglang/srt/mem_cache/memory_pool.py +72 -6
  68. sglang/srt/mem_cache/paged_allocator.py +39 -0
  69. sglang/srt/metrics/collector.py +23 -53
  70. sglang/srt/model_executor/cuda_graph_runner.py +16 -13
  71. sglang/srt/model_executor/forward_batch_info.py +10 -10
  72. sglang/srt/model_executor/model_runner.py +64 -59
  73. sglang/srt/model_loader/loader.py +19 -1
  74. sglang/srt/model_loader/weight_utils.py +6 -3
  75. sglang/srt/models/clip.py +568 -0
  76. sglang/srt/models/deepseek_janus_pro.py +12 -17
  77. sglang/srt/models/deepseek_v2.py +339 -123
  78. sglang/srt/models/deepseek_vl2.py +105 -104
  79. sglang/srt/models/gemma3_causal.py +12 -2
  80. sglang/srt/models/gemma3_mm.py +20 -80
  81. sglang/srt/models/llama.py +4 -1
  82. sglang/srt/models/llava.py +31 -19
  83. sglang/srt/models/llavavid.py +16 -7
  84. sglang/srt/models/minicpmo.py +63 -147
  85. sglang/srt/models/minicpmv.py +17 -27
  86. sglang/srt/models/mllama.py +29 -14
  87. sglang/srt/models/qwen2.py +9 -6
  88. sglang/srt/models/qwen2_5_vl.py +21 -31
  89. sglang/srt/models/qwen2_vl.py +20 -21
  90. sglang/srt/openai_api/adapter.py +106 -93
  91. sglang/srt/openai_api/protocol.py +10 -5
  92. sglang/srt/patch_torch.py +71 -0
  93. sglang/srt/platforms/interface.py +371 -0
  94. sglang/srt/server_args.py +120 -25
  95. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  96. sglang/srt/speculative/eagle_utils.py +140 -28
  97. sglang/srt/speculative/eagle_worker.py +94 -25
  98. sglang/srt/utils.py +137 -51
  99. sglang/test/runners.py +27 -2
  100. sglang/test/test_custom_ops.py +55 -0
  101. sglang/test/test_utils.py +14 -27
  102. sglang/utils.py +2 -2
  103. sglang/version.py +1 -1
  104. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
  105. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
  106. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
  107. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
  108. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0

sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
@@ -11,11 +11,11 @@ import tempfile
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
 from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
-from sglang.srt.utils import cuda_device_count_stateless
 
 logger = logging.getLogger(__name__)
 
@@ -218,7 +218,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
     is_distributed = dist.is_initialized()
 
-    num_dev = cuda_device_count_stateless()
+    num_dev = torch.cuda.device_count()
     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))

sglang/srt/distributed/parallel_state.py
@@ -264,10 +264,16 @@ class GroupCoordinator:
         self.ca_comm: Optional[CustomAllreduce] = None
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
-            self.ca_comm = CustomAllreduce(
-                group=self.cpu_group,
-                device=self.device,
-            )
+            try:
+                self.ca_comm = CustomAllreduce(
+                    group=self.cpu_group,
+                    device=self.device,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Setup Custom allreduce failed with {e}. To silence this "
+                    "warning, specify --disable-custom-all-reduce explicitly."
+                )
 
         from sglang.srt.distributed.device_communicators.hpu_communicator import (
             HpuCommunicator,
@@ -439,6 +445,15 @@ class GroupCoordinator:
         else:
             torch.distributed.all_reduce(input_, group=self.device_group)
 
+    def reduce_scatter(
+        self,
+        output: torch.Tensor,
+        input_list: List[torch.Tensor],
+    ) -> None:
+        # TODO(ch-wan): support other backends
+        torch.distributed.reduce_scatter(output, input_list, group=self.device_group)
+        return output
+
     def _all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
         pynccl_comm = self.pynccl_comm
         if pynccl_comm is not None and not pynccl_comm.disabled:
@@ -456,11 +471,23 @@
                 output, input, group_name=self.unique_name
             )
 
-    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    def all_gather(
+        self,
+        input_: torch.Tensor,
+        dim: int = -1,
+        tensor_list: List[torch.Tensor] = None,
+    ) -> torch.Tensor:
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
             return input_
+
+        if tensor_list is not None:
+            # TODO(ch-wan): support other backends
+            return torch.distributed.all_gather(
+                tensor_list, input_, group=self.device_group
+            )
+
         assert (
             -input_.dim() <= dim < input_.dim()
         ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"

sglang/srt/entrypoints/engine.py
@@ -151,10 +151,6 @@ class Engine:
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
         Please refer to `GenerateReqInput` for the documentation.
         """
-        modalities_list = []
-        if image_data is not None:
-            modalities_list.append("image")
-
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
@@ -165,7 +161,6 @@ class Engine:
             top_logprobs_num=top_logprobs_num,
             token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
-            modalities=modalities_list,
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=return_hidden_states,
             stream=stream,

sglang/srt/entrypoints/http_server.py
@@ -561,7 +561,13 @@ def available_models():
     served_model_names = [_global_state.tokenizer_manager.served_model_name]
     model_cards = []
     for served_model_name in served_model_names:
-        model_cards.append(ModelCard(id=served_model_name, root=served_model_name))
+        model_cards.append(
+            ModelCard(
+                id=served_model_name,
+                root=served_model_name,
+                max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+            )
+        )
     return ModelList(data=model_cards)
 
 
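
A hypothetical client-side check of the `max_model_len` field now included in the `/v1/models` response. The server address and any response fields other than `data`, `id`, and `max_model_len` are assumptions for illustration, not confirmed API details.

```python
# Hypothetical probe of /v1/models; assumes an sglang server is already
# running at this address (the port is illustrative).
import requests

resp = requests.get("http://127.0.0.1:30000/v1/models", timeout=5)
resp.raise_for_status()
for card in resp.json().get("data", []):
    # max_model_len mirrors the tokenizer manager's context_len.
    print(card.get("id"), card.get("max_model_len"))
```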

sglang/srt/entrypoints/verl_engine.py
@@ -19,6 +19,7 @@ import torch.distributed as dist
 from torch.distributed.tensor import DeviceMesh, DTensor
 
 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.server import Engine
 from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj
 
@@ -30,6 +31,7 @@ class VerlEngine:
         nnodes: int = 1,
         **kwargs,
     ):
+        monkey_patch_torch_reductions()
         self._device_mesh_cpu = device_mesh_cpu
         self._tp_rank = device_mesh_cpu.get_local_rank()
         self._tp_size = device_mesh_cpu.size()

sglang/srt/function_call_parser.py
@@ -290,7 +290,6 @@ class BaseFormatDetector(ABC):
                 calls=[
                     ToolCallItem(
                         tool_index=self.current_tool_id,
-                        name="",
                         parameters=argument_diff,
                     )
                 ],