sglang 0.4.5__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. sglang/bench_one_batch.py +21 -0
  2. sglang/bench_serving.py +10 -4
  3. sglang/srt/configs/model_config.py +37 -5
  4. sglang/srt/constrained/base_grammar_backend.py +26 -5
  5. sglang/srt/constrained/llguidance_backend.py +1 -0
  6. sglang/srt/constrained/outlines_backend.py +1 -0
  7. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  8. sglang/srt/constrained/xgrammar_backend.py +1 -0
  9. sglang/srt/disaggregation/base/__init__.py +8 -0
  10. sglang/srt/disaggregation/base/conn.py +113 -0
  11. sglang/srt/disaggregation/decode.py +18 -5
  12. sglang/srt/disaggregation/mini_lb.py +53 -122
  13. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  14. sglang/srt/disaggregation/mooncake/conn.py +615 -0
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
  16. sglang/srt/disaggregation/prefill.py +43 -19
  17. sglang/srt/disaggregation/utils.py +31 -0
  18. sglang/srt/entrypoints/EngineBase.py +53 -0
  19. sglang/srt/entrypoints/engine.py +36 -8
  20. sglang/srt/entrypoints/http_server.py +37 -8
  21. sglang/srt/entrypoints/http_server_engine.py +142 -0
  22. sglang/srt/entrypoints/verl_engine.py +37 -10
  23. sglang/srt/hf_transformers_utils.py +4 -0
  24. sglang/srt/layers/attention/flashattention_backend.py +330 -200
  25. sglang/srt/layers/attention/flashinfer_backend.py +13 -7
  26. sglang/srt/layers/attention/vision.py +1 -1
  27. sglang/srt/layers/dp_attention.py +2 -4
  28. sglang/srt/layers/elementwise.py +15 -2
  29. sglang/srt/layers/linear.py +1 -0
  30. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +38 -21
  38. sglang/srt/layers/moe/router.py +7 -1
  39. sglang/srt/layers/moe/topk.py +37 -16
  40. sglang/srt/layers/quantization/__init__.py +12 -5
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
  42. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
  43. sglang/srt/layers/quantization/fp8.py +25 -13
  44. sglang/srt/layers/quantization/fp8_kernel.py +130 -4
  45. sglang/srt/layers/quantization/fp8_utils.py +34 -6
  46. sglang/srt/layers/quantization/kv_cache.py +43 -52
  47. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  48. sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
  49. sglang/srt/layers/quantization/w8a8_int8.py +1 -0
  50. sglang/srt/layers/radix_attention.py +13 -1
  51. sglang/srt/layers/rotary_embedding.py +12 -1
  52. sglang/srt/managers/io_struct.py +254 -97
  53. sglang/srt/managers/mm_utils.py +3 -2
  54. sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
  55. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  56. sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  57. sglang/srt/managers/schedule_batch.py +62 -21
  58. sglang/srt/managers/scheduler.py +71 -14
  59. sglang/srt/managers/tokenizer_manager.py +17 -3
  60. sglang/srt/managers/tp_worker.py +1 -0
  61. sglang/srt/mem_cache/memory_pool.py +14 -1
  62. sglang/srt/metrics/collector.py +9 -0
  63. sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  64. sglang/srt/model_executor/forward_batch_info.py +234 -15
  65. sglang/srt/model_executor/model_runner.py +48 -9
  66. sglang/srt/model_loader/loader.py +31 -4
  67. sglang/srt/model_loader/weight_utils.py +4 -2
  68. sglang/srt/models/baichuan.py +2 -0
  69. sglang/srt/models/chatglm.py +1 -0
  70. sglang/srt/models/commandr.py +1 -0
  71. sglang/srt/models/dbrx.py +1 -0
  72. sglang/srt/models/deepseek.py +1 -0
  73. sglang/srt/models/deepseek_v2.py +248 -61
  74. sglang/srt/models/exaone.py +1 -0
  75. sglang/srt/models/gemma.py +1 -0
  76. sglang/srt/models/gemma2.py +1 -0
  77. sglang/srt/models/gemma3_causal.py +1 -0
  78. sglang/srt/models/gpt2.py +1 -0
  79. sglang/srt/models/gpt_bigcode.py +1 -0
  80. sglang/srt/models/granite.py +1 -0
  81. sglang/srt/models/grok.py +1 -0
  82. sglang/srt/models/internlm2.py +1 -0
  83. sglang/srt/models/llama.py +1 -0
  84. sglang/srt/models/llama4.py +101 -34
  85. sglang/srt/models/minicpm.py +1 -0
  86. sglang/srt/models/minicpm3.py +2 -0
  87. sglang/srt/models/mixtral.py +1 -0
  88. sglang/srt/models/mixtral_quant.py +1 -0
  89. sglang/srt/models/mllama.py +51 -8
  90. sglang/srt/models/mllama4.py +102 -29
  91. sglang/srt/models/olmo.py +1 -0
  92. sglang/srt/models/olmo2.py +1 -0
  93. sglang/srt/models/olmoe.py +1 -0
  94. sglang/srt/models/phi3_small.py +1 -0
  95. sglang/srt/models/qwen.py +1 -0
  96. sglang/srt/models/qwen2.py +1 -0
  97. sglang/srt/models/qwen2_5_vl.py +35 -70
  98. sglang/srt/models/qwen2_moe.py +1 -0
  99. sglang/srt/models/qwen2_vl.py +27 -25
  100. sglang/srt/models/stablelm.py +1 -0
  101. sglang/srt/models/xverse.py +1 -0
  102. sglang/srt/models/xverse_moe.py +1 -0
  103. sglang/srt/openai_api/adapter.py +4 -1
  104. sglang/srt/patch_torch.py +11 -0
  105. sglang/srt/server_args.py +34 -0
  106. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  107. sglang/srt/speculative/eagle_utils.py +1 -11
  108. sglang/srt/speculative/eagle_worker.py +6 -2
  109. sglang/srt/utils.py +120 -9
  110. sglang/test/attention/test_flashattn_backend.py +259 -221
  111. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  112. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  113. sglang/test/test_block_fp8.py +57 -0
  114. sglang/test/test_utils.py +19 -8
  115. sglang/version.py +1 -1
  116. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
  117. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +120 -106
  118. sglang/srt/disaggregation/conn.py +0 -81
  119. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
  120. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
  121. {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/verl_engine.py

@@ -12,15 +12,18 @@
 # limitations under the License.
 # ==============================================================================
 import os
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Literal, Optional, Tuple, Union

 import torch
 import torch.distributed as dist
+from PIL.Image import Image
 from torch.distributed.tensor import DeviceMesh, DTensor

+from sglang.srt.entrypoints.http_server_engine import HttpServerEngineAdapter
 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
 from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.server import Engine
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj

@@ -29,6 +32,7 @@ class VerlEngine:
         self,
         device_mesh_cpu: DeviceMesh,
         nnodes: int = 1,
+        backend: Literal["engine", "server"] = "engine",
         **kwargs,
     ):
         monkey_patch_torch_reductions()
@@ -39,13 +43,25 @@ class VerlEngine:
         node_rank = self._tp_rank // tp_size_per_node
         first_rank_in_node = self._tp_rank % tp_size_per_node == 0

-        if first_rank_in_node:
-            os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
-            self._engine = Engine(
-                **kwargs, tp_size=self._tp_size, node_rank=node_rank, nnodes=nnodes
-            )
+        # Common engine keyword arguments
+        engine_kwargs = dict(
+            **kwargs, tp_size=self._tp_size, node_rank=node_rank, nnodes=nnodes
+        )
+
+        if backend == "engine":
+            if first_rank_in_node:
+                os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
+                self._engine = Engine(**engine_kwargs)
+            else:
+                self._engine = None
+
+        elif backend == "server":
+            if self._tp_rank == 0:
+                self._engine = HttpServerEngineAdapter(**engine_kwargs)
+            else:
+                self._engine = None
         else:
-            self._engine = None
+            raise ValueError(f"Unsupported backend: {backend}")

         dist.barrier(group=self._device_mesh_cpu.get_group())
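For orientation, a minimal usage sketch of the new backend switch (not part of the diff): the model path, two-way tensor-parallel mesh, and torchrun launch are illustrative assumptions, and all keyword arguments other than backend are simply forwarded to the underlying engine.

# Illustrative sketch only -- run under torchrun so the CPU process group can initialize.
from torch.distributed.device_mesh import init_device_mesh

from sglang.srt.entrypoints.verl_engine import VerlEngine

device_mesh_cpu = init_device_mesh("cpu", mesh_shape=(2,), mesh_dim_names=("tp",))

# backend="engine" (default): the first rank in each node spawns an in-process Engine.
engine = VerlEngine(
    device_mesh_cpu=device_mesh_cpu,
    backend="engine",
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # assumed model; any supported path works
)

# backend="server": only tp_rank 0 holds an HttpServerEngineAdapter; other ranks keep None.
engine = VerlEngine(
    device_mesh_cpu=device_mesh_cpu,
    backend="server",
    model_path="meta-llama/Llama-3.1-8B-Instruct",
)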
 
@@ -56,9 +72,19 @@ class VerlEngine:
         sampling_params: Optional[Union[List[Dict], Dict]] = None,
         # The token ids for text; one can either specify text or input_ids.
         input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-        # The image input. It can be a file name, a url, or base64 encoded string.
-        # See also python/sglang/srt/utils.py:load_image.
-        image_data: Optional[Union[List[str], str]] = None,
+        # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+        # Can be formatted as:
+        # - Single image for a single request
+        # - List of images (one per request in a batch)
+        # - List of lists of images (multiple images per request)
+        # See also python/sglang/srt/utils.py:load_image for more details.
+        image_data: Optional[
+            Union[
+                List[List[Union[Image, str]]],
+                List[Union[Image, str]],
+                Union[Image, str],
+            ]
+        ] = None,
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
         top_logprobs_num: Optional[Union[List[int], int]] = None,
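As a usage note (not from the diff), the widened image_data annotation accepts three shapes. A hedged sketch, assuming engine is a VerlEngine built on a multimodal checkpoint and the file names and URL are placeholders:

from PIL import Image

img = Image.open("cat.png")  # hypothetical local file

# 1) A single image for a single request.
out = engine.generate(prompt="Describe this image.", image_data=img)

# 2) One image per request in a batch, parallel to the prompt list; file names,
#    URLs, and base64-encoded strings are also accepted.
out = engine.generate(
    prompt=["Describe image A.", "Describe image B."],
    image_data=["a.png", "https://example.com/b.png"],
)

# 3) Several images per request, as a list of lists.
out = engine.generate(
    prompt=["Compare these two pictures."],
    image_data=[["a.png", "b.png"]],
)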
@@ -92,6 +118,7 @@ class VerlEngine:
             rank=self._tp_rank,
             dist_group=self._device_mesh_cpu.get_group(),
             src=self._device_mesh_cpu.mesh[0].item(),
+            force_cpu_device=False,
         )

         return output
sglang/srt/hf_transformers_utils.py

@@ -215,6 +215,7 @@ def get_processor(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     tokenizer_revision: Optional[str] = None,
+    use_fast: Optional[bool] = True,
     **kwargs,
 ):
     # pop 'revision' from kwargs if present.
@@ -232,6 +233,9 @@ def get_processor(
         if "size" not in kwargs:
             kwargs["size"] = {"shortest_edge": 3136, "longest_edge": 1003520}

+    if config.model_type not in {"llava", "clip"}:
+        kwargs["use_fast"] = use_fast
+
     processor = AutoProcessor.from_pretrained(
         tokenizer_name,
         *args,
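A small sketch of the new flag (assumption: a Qwen2-VL checkpoint path; any processor-backed model works). For the llava and clip model types the flag is intentionally not forwarded; for everything else it reaches AutoProcessor.from_pretrained.

from sglang.srt.hf_transformers_utils import get_processor

# Default behavior: use_fast=True is forwarded to AutoProcessor.from_pretrained.
processor = get_processor("Qwen/Qwen2-VL-7B-Instruct")

# Opt out of the fast image processor, e.g. to match slow-processor preprocessing exactly.
processor_slow = get_processor("Qwen/Qwen2-VL-7B-Instruct", use_fast=False)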