sglang 0.4.5__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. sglang/bench_one_batch.py +21 -0
  2. sglang/bench_serving.py +10 -4
  3. sglang/srt/configs/model_config.py +37 -5
  4. sglang/srt/constrained/base_grammar_backend.py +26 -5
  5. sglang/srt/constrained/llguidance_backend.py +1 -0
  6. sglang/srt/constrained/outlines_backend.py +1 -0
  7. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  8. sglang/srt/constrained/xgrammar_backend.py +1 -0
  9. sglang/srt/disaggregation/base/__init__.py +8 -0
  10. sglang/srt/disaggregation/base/conn.py +113 -0
  11. sglang/srt/disaggregation/decode.py +18 -5
  12. sglang/srt/disaggregation/mini_lb.py +53 -122
  13. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  14. sglang/srt/disaggregation/mooncake/conn.py +615 -0
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
  16. sglang/srt/disaggregation/prefill.py +43 -19
  17. sglang/srt/disaggregation/utils.py +31 -0
  18. sglang/srt/entrypoints/EngineBase.py +53 -0
  19. sglang/srt/entrypoints/engine.py +36 -8
  20. sglang/srt/entrypoints/http_server.py +37 -8
  21. sglang/srt/entrypoints/http_server_engine.py +142 -0
  22. sglang/srt/entrypoints/verl_engine.py +37 -10
  23. sglang/srt/hf_transformers_utils.py +4 -0
  24. sglang/srt/layers/attention/flashattention_backend.py +330 -200
  25. sglang/srt/layers/attention/flashinfer_backend.py +13 -7
  26. sglang/srt/layers/attention/vision.py +1 -1
  27. sglang/srt/layers/dp_attention.py +2 -4
  28. sglang/srt/layers/elementwise.py +15 -2
  29. sglang/srt/layers/linear.py +1 -0
  30. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +38 -21
  38. sglang/srt/layers/moe/router.py +7 -1
  39. sglang/srt/layers/moe/topk.py +37 -16
  40. sglang/srt/layers/quantization/__init__.py +12 -5
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
  42. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
  43. sglang/srt/layers/quantization/fp8.py +25 -13
  44. sglang/srt/layers/quantization/fp8_kernel.py +130 -4
  45. sglang/srt/layers/quantization/fp8_utils.py +34 -6
  46. sglang/srt/layers/quantization/kv_cache.py +43 -52
  47. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  48. sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
  49. sglang/srt/layers/quantization/w8a8_int8.py +1 -0
  50. sglang/srt/layers/radix_attention.py +13 -1
  51. sglang/srt/layers/rotary_embedding.py +12 -1
  52. sglang/srt/managers/io_struct.py +254 -97
  53. sglang/srt/managers/mm_utils.py +3 -2
  54. sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
  55. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  56. sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  57. sglang/srt/managers/schedule_batch.py +62 -21
  58. sglang/srt/managers/scheduler.py +71 -14
  59. sglang/srt/managers/tokenizer_manager.py +17 -3
  60. sglang/srt/managers/tp_worker.py +1 -0
  61. sglang/srt/mem_cache/memory_pool.py +14 -1
  62. sglang/srt/metrics/collector.py +9 -0
  63. sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  64. sglang/srt/model_executor/forward_batch_info.py +234 -15
  65. sglang/srt/model_executor/model_runner.py +48 -9
  66. sglang/srt/model_loader/loader.py +31 -4
  67. sglang/srt/model_loader/weight_utils.py +4 -2
  68. sglang/srt/models/baichuan.py +2 -0
  69. sglang/srt/models/chatglm.py +1 -0
  70. sglang/srt/models/commandr.py +1 -0
  71. sglang/srt/models/dbrx.py +1 -0
  72. sglang/srt/models/deepseek.py +1 -0
  73. sglang/srt/models/deepseek_v2.py +248 -61
  74. sglang/srt/models/exaone.py +1 -0
  75. sglang/srt/models/gemma.py +1 -0
  76. sglang/srt/models/gemma2.py +1 -0
  77. sglang/srt/models/gemma3_causal.py +1 -0
  78. sglang/srt/models/gpt2.py +1 -0
  79. sglang/srt/models/gpt_bigcode.py +1 -0
  80. sglang/srt/models/granite.py +1 -0
  81. sglang/srt/models/grok.py +1 -0
  82. sglang/srt/models/internlm2.py +1 -0
  83. sglang/srt/models/llama.py +1 -0
  84. sglang/srt/models/llama4.py +101 -34
  85. sglang/srt/models/minicpm.py +1 -0
  86. sglang/srt/models/minicpm3.py +2 -0
  87. sglang/srt/models/mixtral.py +1 -0
  88. sglang/srt/models/mixtral_quant.py +1 -0
  89. sglang/srt/models/mllama.py +51 -8
  90. sglang/srt/models/mllama4.py +102 -29
  91. sglang/srt/models/olmo.py +1 -0
  92. sglang/srt/models/olmo2.py +1 -0
  93. sglang/srt/models/olmoe.py +1 -0
  94. sglang/srt/models/phi3_small.py +1 -0
  95. sglang/srt/models/qwen.py +1 -0
  96. sglang/srt/models/qwen2.py +1 -0
  97. sglang/srt/models/qwen2_5_vl.py +35 -70
  98. sglang/srt/models/qwen2_moe.py +1 -0
  99. sglang/srt/models/qwen2_vl.py +27 -25
  100. sglang/srt/models/stablelm.py +1 -0
  101. sglang/srt/models/xverse.py +1 -0
  102. sglang/srt/models/xverse_moe.py +1 -0
  103. sglang/srt/openai_api/adapter.py +4 -1
  104. sglang/srt/patch_torch.py +11 -0
  105. sglang/srt/server_args.py +34 -0
  106. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  107. sglang/srt/speculative/eagle_utils.py +1 -11
  108. sglang/srt/speculative/eagle_worker.py +6 -2
  109. sglang/srt/utils.py +120 -9
  110. sglang/test/attention/test_flashattn_backend.py +259 -221
  111. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  112. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  113. sglang/test/test_block_fp8.py +57 -0
  114. sglang/test/test_utils.py +19 -8
  115. sglang/version.py +1 -1
  116. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
  117. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +120 -106
  118. sglang/srt/disaggregation/conn.py +0 -81
  119. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
  120. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
  121. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -16,6 +16,7 @@ import base64
 import builtins
 import ctypes
 import dataclasses
+import importlib
 import io
 import ipaddress
 import itertools
@@ -127,7 +128,7 @@ def is_flashinfer_available():
     """
     if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
         return False
-    return is_cuda()
+    return importlib.util.find_spec("flashinfer") is not None and is_cuda()
 
 
 def is_cuda_available():
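Note: the old check returned True on any CUDA build even when the flashinfer wheel was absent; the new one probes for the package first. importlib.util.find_spec tests whether a module is importable without executing it. A minimal standalone sketch of the same pattern (module names here are illustrative):

import importlib.util

def has_module(name: str) -> bool:
    # find_spec returns None when the package is not installed;
    # unlike __import__, it does not run the module's top-level code.
    return importlib.util.find_spec(name) is not None

print(has_module("math"))        # True: stdlib is always importable
print(has_module("flashinfer"))  # False unless the flashinfer wheel is installed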
@@ -568,7 +569,7 @@ def encode_video(video_path, frame_count_limit=None):
 
 
 def load_image(
-    image_file: Union[Image.Image, str, bytes]
+    image_file: Union[Image.Image, str, bytes],
 ) -> tuple[Image.Image, tuple[int, int]]:
     image = image_size = None
     if isinstance(image_file, Image.Image):
@@ -845,33 +846,38 @@ def broadcast_pyobj(
     rank: int,
     dist_group: Optional[torch.distributed.ProcessGroup] = None,
     src: int = 0,
+    force_cpu_device: bool = True,
 ):
     """Broadcast inputs from rank=0 to all other ranks with torch.dist backend."""
+    device = torch.device(
+        "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
+    )
 
     if rank == 0:
         if len(data) == 0:
-            tensor_size = torch.tensor([0], dtype=torch.long)
+            tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.broadcast(tensor_size, src=src, group=dist_group)
         else:
             serialized_data = pickle.dumps(data)
             size = len(serialized_data)
+
             tensor_data = torch.ByteTensor(
                 np.frombuffer(serialized_data, dtype=np.uint8)
-            )
-            tensor_size = torch.tensor([size], dtype=torch.long)
+            ).to(device)
+            tensor_size = torch.tensor([size], dtype=torch.long, device=device)
 
             dist.broadcast(tensor_size, src=src, group=dist_group)
             dist.broadcast(tensor_data, src=src, group=dist_group)
         return data
     else:
-        tensor_size = torch.tensor([0], dtype=torch.long)
+        tensor_size = torch.tensor([0], dtype=torch.long, device=device)
         dist.broadcast(tensor_size, src=src, group=dist_group)
         size = tensor_size.item()
 
         if size == 0:
             return []
 
-        tensor_data = torch.empty(size, dtype=torch.uint8)
+        tensor_data = torch.empty(size, dtype=torch.uint8, device=device)
         dist.broadcast(tensor_data, src=src, group=dist_group)
 
         serialized_data = bytes(tensor_data.cpu().numpy())
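Note: the broadcast keeps its two-phase size-then-payload pattern; the new force_cpu_device flag only controls where the staging tensors live (CPU for gloo, optionally CUDA to ride an existing NCCL group). A hedged usage sketch, assuming a gloo process group launched with torchrun:

import torch.distributed as dist
from sglang.srt.utils import broadcast_pyobj

# e.g. torchrun --nproc-per-node=2 demo.py
dist.init_process_group(backend="gloo")
rank = dist.get_rank()

# Rank 0 supplies the payload; other ranks pass an empty list and
# receive the unpickled copy from the broadcast.
payload = [{"req_id": 1, "text": "hello"}] if rank == 0 else []
result = broadcast_pyobj(payload, rank, force_cpu_device=True)
assert result == [{"req_id": 1, "text": "hello"}]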
@@ -1480,14 +1486,43 @@ def permute_weight(x: torch.Tensor) -> torch.Tensor:
 
 class MultiprocessingSerializer:
     @staticmethod
-    def serialize(obj):
+    def serialize(obj, output_str: bool = False):
+        """
+        Serialize a Python object using ForkingPickler.
+
+        Args:
+            obj: The object to serialize.
+            output_str (bool): If True, return a base64-encoded string instead of raw bytes.
+
+        Returns:
+            bytes or str: The serialized object.
+        """
         buf = io.BytesIO()
         ForkingPickler(buf).dump(obj)
         buf.seek(0)
-        return buf.read()
+        output = buf.read()
+
+        if output_str:
+            # Convert bytes to base64-encoded string
+            output = base64.b64encode(output).decode("utf-8")
+
+        return output
 
     @staticmethod
     def deserialize(data):
+        """
+        Deserialize a previously serialized object.
+
+        Args:
+            data (bytes or str): The serialized data, optionally base64-encoded.
+
+        Returns:
+            The deserialized Python object.
+        """
+        if isinstance(data, str):
+            # Decode base64 string to bytes
+            data = base64.b64decode(data)
+
         return ForkingPickler.loads(data)
 
 
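Note: the new output_str path makes the serialized payload ASCII-safe, so it can travel through JSON bodies or environment variables unchanged. A small round-trip, assuming the class is imported from sglang.srt.utils:

from sglang.srt.utils import MultiprocessingSerializer

obj = {"weights_version": 3, "shards": [0, 1]}

# bytes round-trip (previous behavior)
raw = MultiprocessingSerializer.serialize(obj)
assert MultiprocessingSerializer.deserialize(raw) == obj

# base64 string round-trip (new): deserialize() detects the str
# input and decodes it back to bytes before unpickling.
text = MultiprocessingSerializer.serialize(obj, output_str=True)
assert isinstance(text, str)
assert MultiprocessingSerializer.deserialize(text) == obj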
@@ -1819,3 +1854,79 @@ class DeepEPMode(Enum):
             return DeepEPMode.low_latency
         else:
             return DeepEPMode.normal
+
+
+def fast_topk(values, topk, dim):
+    if topk == 1:
+        # Use max along the specified dimension to get both value and index
+        return torch.max(values, dim=dim, keepdim=True)
+    else:
+        # Use topk for efficiency with larger k values
+        return torch.topk(values, topk, dim=dim)
+
+
+def is_hopper_with_cuda_12_3():
+    if not is_cuda():
+        return False
+    is_hopper = torch.cuda.get_device_capability()[0] == 9
+    cuda_version = torch.version.cuda.split(".")
+    is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
+    return is_hopper and is_cuda_compatible
+
+
+def get_free_port():
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def get_local_ip_by_remote() -> str:
+    # try ipv4
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    try:
+        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    # try ipv6
+    try:
+        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+        # Google's public DNS server, see
+        # https://developers.google.com/speed/public-dns/docs/using#addresses
+        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        raise ValueError(f"Can not get local ip")
+
+
+def is_page_size_one(server_args):
+    return server_args.page_size == 1
+
+
+def is_no_spec_infer_or_topk_one(server_args):
+    return server_args.speculative_eagle_topk is None or (
+        server_args.speculative_eagle_topk is not None
+        and server_args.speculative_eagle_topk == 1
+        and is_page_size_one(server_args)
+    )
+
+
+def is_fa3_default_architecture(hf_config):
+    architectures = getattr(hf_config, "architectures", None)
+    if not isinstance(architectures, list) or not architectures:
+        return False
+    default_archs = {
+        "Qwen2ForCausalLM",
+        "Llama4ForConditionalGeneration",
+        "LlamaForCausalLM",
+        "MistralForCausalLM",
+    }
+    return architectures[0] in default_archs
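Note on the new helpers: fast_topk relies on torch.max with keepdim=True returning the same (values, indices) pair that torch.topk produces for k == 1, while skipping topk's sorting overhead. A quick sanity check, assuming imports from sglang.srt.utils:

import torch
from sglang.srt.utils import fast_topk, get_free_port

x = torch.randn(4, 32)

# k == 1 takes the torch.max fast path; result matches torch.topk.
v1, i1 = fast_topk(x, topk=1, dim=-1)
v2, i2 = torch.topk(x, k=1, dim=-1)
assert torch.equal(v1, v2) and torch.equal(i1, i2)

# k > 1 falls through to torch.topk directly.
v3, i3 = fast_topk(x, topk=4, dim=-1)
assert v3.shape == (4, 4)

# get_free_port binds port 0 and returns the OS-assigned port number.
port = get_free_port()
assert 0 < port < 65536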