sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_serving.py +3 -6
  4. sglang/compile_deep_gemm.py +136 -0
  5. sglang/lang/backend/anthropic.py +0 -4
  6. sglang/lang/backend/base_backend.py +1 -1
  7. sglang/lang/backend/openai.py +6 -2
  8. sglang/lang/backend/runtime_endpoint.py +5 -1
  9. sglang/lang/backend/vertexai.py +0 -1
  10. sglang/lang/compiler.py +1 -7
  11. sglang/lang/tracer.py +3 -7
  12. sglang/srt/_custom_ops.py +0 -2
  13. sglang/srt/configs/model_config.py +4 -1
  14. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  15. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  16. sglang/srt/constrained/xgrammar_backend.py +27 -4
  17. sglang/srt/custom_op.py +0 -62
  18. sglang/srt/disaggregation/decode.py +105 -6
  19. sglang/srt/disaggregation/mini_lb.py +74 -9
  20. sglang/srt/disaggregation/mooncake/conn.py +33 -63
  21. sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
  22. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  23. sglang/srt/disaggregation/nixl/conn.py +622 -0
  24. sglang/srt/disaggregation/prefill.py +137 -17
  25. sglang/srt/disaggregation/utils.py +32 -0
  26. sglang/srt/entrypoints/engine.py +4 -0
  27. sglang/srt/entrypoints/http_server.py +3 -7
  28. sglang/srt/entrypoints/verl_engine.py +7 -5
  29. sglang/srt/function_call_parser.py +60 -0
  30. sglang/srt/layers/activation.py +6 -8
  31. sglang/srt/layers/attention/flashattention_backend.py +883 -209
  32. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  33. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  34. sglang/srt/layers/attention/triton_backend.py +6 -0
  35. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  36. sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
  37. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  38. sglang/srt/layers/dp_attention.py +1 -1
  39. sglang/srt/layers/layernorm.py +20 -5
  40. sglang/srt/layers/linear.py +17 -3
  41. sglang/srt/layers/moe/ep_moe/layer.py +17 -29
  42. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  45. sglang/srt/layers/moe/topk.py +27 -30
  46. sglang/srt/layers/parameter.py +0 -2
  47. sglang/srt/layers/quantization/__init__.py +1 -0
  48. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  49. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  52. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  53. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  54. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  55. sglang/srt/layers/quantization/fp8.py +115 -132
  56. sglang/srt/layers/quantization/fp8_kernel.py +213 -88
  57. sglang/srt/layers/quantization/fp8_utils.py +189 -264
  58. sglang/srt/layers/quantization/gptq.py +13 -7
  59. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  60. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  61. sglang/srt/layers/quantization/utils.py +5 -11
  62. sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
  63. sglang/srt/layers/quantization/w8a8_int8.py +7 -7
  64. sglang/srt/layers/radix_attention.py +15 -0
  65. sglang/srt/layers/rotary_embedding.py +9 -8
  66. sglang/srt/layers/sampler.py +7 -12
  67. sglang/srt/lora/backend/base_backend.py +18 -2
  68. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  69. sglang/srt/lora/backend/triton_backend.py +1 -1
  70. sglang/srt/lora/layers.py +1 -1
  71. sglang/srt/lora/lora.py +1 -1
  72. sglang/srt/lora/lora_manager.py +1 -1
  73. sglang/srt/managers/data_parallel_controller.py +7 -1
  74. sglang/srt/managers/detokenizer_manager.py +0 -1
  75. sglang/srt/managers/io_struct.py +15 -3
  76. sglang/srt/managers/mm_utils.py +4 -3
  77. sglang/srt/managers/multimodal_processor.py +0 -2
  78. sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
  79. sglang/srt/managers/schedule_batch.py +15 -4
  80. sglang/srt/managers/scheduler.py +28 -77
  81. sglang/srt/managers/tokenizer_manager.py +116 -29
  82. sglang/srt/managers/tp_worker.py +1 -0
  83. sglang/srt/mem_cache/hiradix_cache.py +41 -29
  84. sglang/srt/mem_cache/memory_pool.py +38 -15
  85. sglang/srt/model_executor/cuda_graph_runner.py +15 -10
  86. sglang/srt/model_executor/model_runner.py +39 -31
  87. sglang/srt/models/bert.py +398 -0
  88. sglang/srt/models/deepseek.py +1 -1
  89. sglang/srt/models/deepseek_nextn.py +74 -70
  90. sglang/srt/models/deepseek_v2.py +292 -348
  91. sglang/srt/models/llama.py +5 -5
  92. sglang/srt/models/minicpm3.py +31 -203
  93. sglang/srt/models/minicpmo.py +17 -6
  94. sglang/srt/models/qwen2.py +4 -1
  95. sglang/srt/models/qwen2_moe.py +14 -13
  96. sglang/srt/models/qwen3.py +335 -0
  97. sglang/srt/models/qwen3_moe.py +423 -0
  98. sglang/srt/openai_api/adapter.py +71 -4
  99. sglang/srt/openai_api/protocol.py +6 -1
  100. sglang/srt/reasoning_parser.py +0 -1
  101. sglang/srt/sampling/sampling_batch_info.py +2 -3
  102. sglang/srt/server_args.py +86 -72
  103. sglang/srt/speculative/build_eagle_tree.py +2 -2
  104. sglang/srt/speculative/eagle_utils.py +2 -2
  105. sglang/srt/speculative/eagle_worker.py +6 -14
  106. sglang/srt/utils.py +62 -6
  107. sglang/test/runners.py +5 -1
  108. sglang/test/test_block_fp8.py +167 -0
  109. sglang/test/test_custom_ops.py +1 -1
  110. sglang/test/test_utils.py +3 -1
  111. sglang/version.py +1 -1
  112. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
  113. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
  114. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
  115. sglang/lang/__init__.py +0 -0
  116. sglang/srt/lora/backend/__init__.py +0 -25
  117. sglang/srt/server.py +0 -18
  118. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -26,11 +26,8 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
-    get_amdgpu_memory_capacity,
     get_device,
-    get_hpu_memory_capacity,
-    get_nvgpu_memory_capacity,
-    is_cuda,
+    get_device_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -49,6 +46,7 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
+    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -155,7 +153,6 @@ class ServerArgs:
     enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    disable_mla: bool = False
     enable_llama4_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
@@ -180,13 +177,14 @@ class ServerArgs:
     tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
-    enable_flashinfer_mla: bool = False  # TODO: remove this argument
-    enable_flashmla: bool = False
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
     n_share_experts_fusion: int = 0
-    disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
+    disable_fast_image_processor: bool = False

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -197,9 +195,7 @@ class ServerArgs:
     disaggregation_mode: str = "null"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
-
-    # multimodal
-    disable_fast_image_processor: bool = False
+    disaggregation_ib_device: Optional[str] = None

     def __post_init__(self):
         # Expert parallelism
@@ -222,31 +218,24 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-        if is_cuda():
-            gpu_mem = get_nvgpu_memory_capacity()
-        elif is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif self.device == "hpu":
-            gpu_mem = get_hpu_memory_capacity()
-        else:
-            # GPU memory is not known yet or no GPU is available.
-            gpu_mem = None
-
-        if is_hip():
-            self.disable_shared_experts_fusion = True
+        gpu_mem = get_device_memory_capacity(self.device)

         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if self.tp_size >= 16:
-                self.mem_fraction_static = 0.79
-            elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.81
-            elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.85
-            elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.87
+            if gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-                self.mem_fraction_static = 0.88
+                # FIXME: more fine grained auto-selection polices
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
@@ -257,7 +246,12 @@

         assert self.chunked_prefill_size % self.page_size == 0

-        if self.enable_flashmla is True:
+        assert self.moe_dense_tp_size in {
+            1,
+            None,
+        }, f"moe_dense_tp_size only support 1 and None currently"
+
+        if self.attention_backend == "flashmla":
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
@@ -270,8 +264,6 @@
                     self.cuda_graph_max_bs = 8
                 else:
                     self.cuda_graph_max_bs = 80
-            else:
-                self.cuda_graph_max_bs = 160

         # Set kernel backends for hpu device
         if self.device == "hpu":
@@ -293,13 +285,6 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal

         # Data parallelism attention
@@ -360,7 +345,18 @@ class ServerArgs:

         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.info("speculative_eagle_topk is changed to 1 when page_size > 1")
+            logger.info(
+                "speculative_eagle_topk is adjusted to 1 when page_size > 1"
+            )
+
+        if (
+            self.speculative_eagle_topk == 1
+            and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
+        ):
+            logger.info(
+                "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
+            )
+            self.speculative_num_draft_tokens = self.speculative_num_steps + 1

         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
@@ -382,18 +378,18 @@ class ServerArgs:
         # PD disaggregation
         if self.disaggregation_mode == "prefill":
             self.disable_cuda_graph = True
-            logger.warning("KV cache is forced as chunk cache for decode server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for prefill server")
+            logger.warning("Cuda graph is disabled for prefill server")
         elif self.disaggregation_mode == "decode":
             self.disable_radix_cache = True
-            logger.warning("Cuda graph is disabled for prefill server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for decode server")
+            logger.warning("KV cache is forced as chunk cache for decode server")

         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        # Set env var before grammar backends init
+        os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
+            "1" if self.disable_outlines_disk_cache else "0"
+        )

     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -430,6 +426,11 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -826,7 +827,7 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton", "torch_native", "fa3"],
+            choices=["flashinfer", "triton", "torch_native", "fa3", "flashmla"],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -846,13 +847,13 @@ class ServerArgs:
         )
         parser.add_argument(
             "--enable-flashinfer-mla",
-            action="store_true",
-            help="Enable FlashInfer MLA optimization. This argument will be deprecated soon! Please use '--attention-backend flashinfer' instead for switching on flashfiner mla!",
+            action=DeprecatedAction,
+            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
         )
         parser.add_argument(
             "--enable-flashmla",
-            action="store_true",
-            help="Enable FlashMLA decode optimization",
+            action=DeprecatedAction,
+            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
         )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
@@ -977,11 +978,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
-        parser.add_argument(
-            "--disable-mla",
-            action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
-        )
         parser.add_argument(
             "--enable-llama4-multimodal",
             default=ServerArgs.enable_llama4_multimodal,
@@ -1090,7 +1086,7 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )
@@ -1102,15 +1098,33 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-ratio",
             type=float,
-            required=False,
             default=ServerArgs.hicache_ratio,
             help="The ratio of the size of host KV cache memory pool to the size of device pool.",
         )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
         parser.add_argument(
             "--enable-deepep-moe",
             action="store_true",
             help="Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1123,18 +1137,18 @@ class ServerArgs:
             "--n-share-experts-fusion",
             type=int,
             default=0,
-            help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1 "
-            "we use tp_size by default.",
+            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
+            "set it to tp_size can get best optimized performace.",
         )
         parser.add_argument(
-            "--disable-shared-experts-fusion",
+            "--disable-chunked-prefix-cache",
             action="store_true",
-            help="Disable shared experts fusion by setting n_share_experts_fusion to 0.",
+            help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
        )
         parser.add_argument(
-            "--disable-chunked-prefix-cache",
+            "--disable-fast-image-processor",
             action="store_true",
-            help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
+            help="Adopt base image processor instead of fast image processor.",
         )

         # Server warmups
@@ -1184,14 +1198,14 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
-
-        # Multimodal
         parser.add_argument(
-            "--disable-fast-image-processor",
-            action="store_true",
-            help="Adopt base image processor instead of fast image processor.",
+            "--disaggregation-ib-device",
+            type=str,
+            default=ServerArgs.disaggregation_ib_device,
+            help="The ib device for disaggregation transfer. Default is None, it will be detected automatically if using the mooncake backend.",
         )

     @classmethod
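
Note on the hunk above: the mem_fraction_static auto-selection now switches from the pure tp_size table to a memory-headroom formula on devices reporting more than 81920 MiB (80 GiB). A minimal Python sketch of that arithmetic follows; the 144384 MiB device size is an illustrative assumption, not a value from this diff.

# Sketch of the mem_fraction_static fallback added in __post_init__ above.
# gpu_mem is in MiB (81920 == 80 GiB); 144384 is an assumed example value.
def auto_mem_fraction_static(gpu_mem: float, tp_size: int) -> float:
    if gpu_mem <= 81920:
        # Smaller devices keep the original tp_size-based table.
        if tp_size >= 16:
            return 0.79
        elif tp_size >= 8:
            return 0.81
        elif tp_size >= 4:
            return 0.85
        elif tp_size >= 2:
            return 0.87
        return 0.88
    # Larger devices reserve roughly 13 GiB and use the rest for static allocation.
    return (gpu_mem - 1024 * 13) / gpu_mem


print(round(auto_mem_fraction_static(144384, tp_size=8), 3))  # prints 0.908
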
sglang/srt/speculative/build_eagle_tree.py CHANGED
@@ -4,9 +4,9 @@ from typing import List

 import torch

-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import is_cuda, is_hip

-if is_cuda_available() or is_hip():
+if is_cuda() or is_hip():
     from sgl_kernel import (
         build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
     )
sglang/srt/speculative/eagle_utils.py CHANGED
@@ -19,9 +19,9 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import fast_topk, is_cuda_available, is_hip, next_power_of_2
+from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2

-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import (
         top_k_renorm_prob,
         top_p_renorm_prob,
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -34,14 +34,9 @@ from sglang.srt.speculative.eagle_utils import (
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import (
-    empty_context,
-    fast_topk,
-    get_available_gpu_memory,
-    is_cuda_available,
-)
+from sglang.srt.utils import empty_context, fast_topk, get_available_gpu_memory, is_cuda

-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import segment_packbits

 logger = logging.getLogger(__name__)
@@ -271,14 +266,11 @@ class EAGLEWorker(TpModelWorker):
             )
         elif batch.forward_mode.is_idle():
             model_worker_batch = batch.get_model_worker_batch()
-            logits_output, next_token_ids, _ = (
-                self.target_worker.forward_batch_generation(
-                    ForwardBatch.init_new(
-                        model_worker_batch, self.target_worker.model_runner
-                    )
-                )
+            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+                model_worker_batch
             )
-            return logits_output, next_token_ids, model_worker_batch.bid, 0, False
+
+            return logits_output, next_token_ids, model_worker_batch.bid, 0
         else:
             logits_output, next_token_ids, bid = self.forward_target_extend(batch)
             with self.draft_tp_context(self.draft_model_runner.tp_group):
sglang/srt/utils.py CHANGED
@@ -55,7 +55,6 @@ import torch.distributed
 import torch.distributed as dist
 import triton
 import zmq
-from decord import VideoReader, cpu
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from PIL import Image
@@ -79,10 +78,34 @@ time_infos = {}

 HIP_FP8_E4M3_FNUZ_MAX = 224.0

+_warned_bool_env_var_keys = set()
+

 def get_bool_env_var(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default)
-    return value.lower() in ("true", "1")
+    value = value.lower()
+
+    truthy_values = ("true", "1")
+    falsy_values = ("false", "0")
+
+    if (value not in truthy_values) and (value not in falsy_values):
+        if value not in _warned_bool_env_var_keys:
+            logger.warning(
+                f"get_bool_env_var({name}) see non-understandable value={value} and treat as false"
+            )
+        _warned_bool_env_var_keys.add(value)
+
+    return value in truthy_values
+
+
+def get_int_env_var(name: str, default: int = 0) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default


 # https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
@@ -131,10 +154,6 @@ def is_flashinfer_available():
     return importlib.util.find_spec("flashinfer") is not None and is_cuda()


-def is_cuda_available():
-    return is_cuda()
-
-
 _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
 )
@@ -545,6 +564,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra


 def encode_video(video_path, frame_count_limit=None):
+    # Lazy import because decord is not available on some arm platforms.
+    from decord import VideoReader, cpu
+
     if not os.path.exists(video_path):
         logger.error(f"Video {video_path} does not exist")
         return []
@@ -772,6 +794,8 @@ def add_api_key_middleware(app, api_key: str):
             return await call_next(request)
         if request.url.path.startswith("/health"):
             return await call_next(request)
+        if request.url.path.startswith("/metrics"):
+            return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
@@ -928,6 +952,8 @@ def get_zmq_socket(
         buf_size = -1

     socket = context.socket(socket_type)
+    if endpoint.find("[") != -1:
+        socket.setsockopt(zmq.IPV6, 1)

     def set_send_opt():
         socket.setsockopt(zmq.SNDHWM, 0)
@@ -1144,6 +1170,20 @@ def get_hpu_memory_capacity():
     )


+def get_device_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
@@ -1911,6 +1951,8 @@ def is_page_size_one(server_args):
     return server_args.page_size == 1


+# TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
+# TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
         server_args.speculative_eagle_topk is not None
@@ -1928,5 +1970,19 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "Gemma2ForCausalLM",
     }
     return architectures[0] in default_archs
+
+
+# Can be more general if it is used in multiple places (keep it simple and thus not general now)
+class BumpAllocator:
+    def __init__(self, buffer_size: int, dtype, device):
+        self._buffer = torch.zeros((buffer_size,), dtype=dtype, device=device)
+        self._pointer = 0
+
+    def allocate(self, size: int):
+        assert self._pointer + size <= len(self._buffer)
+        output = self._buffer[self._pointer : self._pointer + size]
+        self._pointer += size
+        return output
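
The helpers added above can be exercised directly. The following is a minimal, hedged sketch, assuming sglang 0.4.5.post3 and its dependencies are importable; the environment-variable name is hypothetical and chosen only for the example.

import os

import torch

from sglang.srt.utils import BumpAllocator, get_bool_env_var

# Unrecognized values fall back to false and log a one-time warning;
# "SGLANG_EXAMPLE_FLAG" is a made-up name used only for illustration.
os.environ["SGLANG_EXAMPLE_FLAG"] = "maybe"
assert get_bool_env_var("SGLANG_EXAMPLE_FLAG") is False

# BumpAllocator hands out consecutive views of one preallocated buffer.
allocator = BumpAllocator(buffer_size=16, dtype=torch.int64, device="cpu")
a = allocator.allocate(4)  # elements [0:4]
b = allocator.allocate(8)  # elements [4:12]; no extra allocation happens
assert a.numel() == 4 and b.numel() == 8
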
sglang/test/runners.py CHANGED
@@ -26,8 +26,8 @@ from transformers import (
     AutoProcessor,
 )

+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server import Engine
 from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l

@@ -51,6 +51,8 @@ NUM_TOP_LOGPROBS = 5
 def get_dtype_str(torch_dtype):
     if torch_dtype is torch.float16:
         return "float16"
+    if torch_dtype is torch.float32:
+        return "float32"
     else:
         raise NotImplementedError()

@@ -447,6 +449,7 @@ class SRTRunner:
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
+        attention_backend: Optional[str] = None,
         lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
@@ -487,6 +490,7 @@
             lora_paths=lora_paths,
             max_loras_per_batch=max_loras_per_batch,
             lora_backend=lora_backend,
+            attention_backend=attention_backend,
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
             chunked_prefill_size=chunked_prefill_size,
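
As a small, hedged check of the get_dtype_str change in sglang/test/runners.py above (assuming the package and its test dependencies are importable in the current environment):

import torch

from sglang.test.runners import get_dtype_str

# float16 was already handled; float32 support is what this diff adds.
assert get_dtype_str(torch.float16) == "float16"
assert get_dtype_str(torch.float32) == "float32"
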