sglang 0.4.5.post2__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sglang/bench_serving.py +3 -2
  2. sglang/compile_deep_gemm.py +136 -0
  3. sglang/lang/backend/openai.py +5 -1
  4. sglang/lang/backend/runtime_endpoint.py +5 -1
  5. sglang/srt/configs/model_config.py +4 -1
  6. sglang/srt/constrained/xgrammar_backend.py +1 -0
  7. sglang/srt/disaggregation/decode.py +43 -0
  8. sglang/srt/disaggregation/mini_lb.py +69 -8
  9. sglang/srt/disaggregation/mooncake/conn.py +1 -1
  10. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  11. sglang/srt/disaggregation/nixl/conn.py +622 -0
  12. sglang/srt/disaggregation/prefill.py +100 -16
  13. sglang/srt/disaggregation/utils.py +17 -0
  14. sglang/srt/entrypoints/engine.py +4 -0
  15. sglang/srt/entrypoints/http_server.py +3 -7
  16. sglang/srt/function_call_parser.py +60 -0
  17. sglang/srt/layers/activation.py +2 -2
  18. sglang/srt/layers/attention/flashattention_backend.py +781 -150
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
  21. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  22. sglang/srt/layers/dp_attention.py +1 -1
  23. sglang/srt/layers/layernorm.py +19 -4
  24. sglang/srt/layers/moe/ep_moe/layer.py +2 -0
  25. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  26. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  27. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  28. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  29. sglang/srt/layers/quantization/fp8_kernel.py +7 -38
  30. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  31. sglang/srt/layers/quantization/gptq.py +13 -7
  32. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  33. sglang/srt/layers/quantization/w8a8_int8.py +3 -3
  34. sglang/srt/layers/rotary_embedding.py +6 -6
  35. sglang/srt/layers/sampler.py +2 -2
  36. sglang/srt/managers/data_parallel_controller.py +7 -1
  37. sglang/srt/managers/io_struct.py +14 -3
  38. sglang/srt/managers/schedule_batch.py +13 -0
  39. sglang/srt/managers/scheduler.py +16 -6
  40. sglang/srt/managers/tokenizer_manager.py +115 -29
  41. sglang/srt/managers/tp_worker.py +1 -0
  42. sglang/srt/mem_cache/hiradix_cache.py +40 -32
  43. sglang/srt/mem_cache/memory_pool.py +31 -13
  44. sglang/srt/model_executor/cuda_graph_runner.py +13 -8
  45. sglang/srt/model_executor/model_runner.py +19 -4
  46. sglang/srt/models/deepseek_v2.py +9 -6
  47. sglang/srt/models/minicpm3.py +2 -2
  48. sglang/srt/models/minicpmo.py +17 -6
  49. sglang/srt/openai_api/adapter.py +71 -4
  50. sglang/srt/openai_api/protocol.py +6 -1
  51. sglang/srt/server_args.py +52 -40
  52. sglang/srt/speculative/build_eagle_tree.py +2 -2
  53. sglang/srt/speculative/eagle_utils.py +2 -2
  54. sglang/srt/speculative/eagle_worker.py +2 -7
  55. sglang/srt/utils.py +46 -5
  56. sglang/test/test_utils.py +3 -1
  57. sglang/version.py +1 -1
  58. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +3 -3
  59. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +62 -57
  60. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +0 -0
  61. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  62. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py CHANGED
@@ -42,6 +42,10 @@ from sglang.srt.layers.dp_attention import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer
+from sglang.srt.layers.quantization.deep_gemm import (
+    _ENABLE_JIT_DEEPGEMM,
+    update_deep_gemm_config,
+)
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.lora.lora_manager import LoRAManager
@@ -169,6 +173,10 @@ class ModelRunner:
         # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()
 
+        # Update deep gemm configure
+        if _ENABLE_JIT_DEEPGEMM:
+            update_deep_gemm_config(gpu_id, server_args)
+
         # If it is a draft model tp_group can be different.
         self.initialize(min_per_gpu_memory)
 
@@ -221,7 +229,16 @@ class ModelRunner:
         server_args = self.server_args
 
         if server_args.attention_backend is None:
-            # By default, use flashinfer for non-mla attention and triton for mla attention
+            """
+            We auto select the fastest attention backend according to the current offering
+            1. Models with MHA Architecture (e.g: Llama, QWen)
+                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+            2. Models with MLA Architecture and using FA3
+                2.1 We will use FA3 backend on hopper.
+                2.2 Otherwise, we will use triton backend.
+            """
+
            if not self.use_mla_backend:
                 if (
                     is_hopper_with_cuda_12_3()
@@ -234,9 +251,7 @@ class ModelRunner:
                     "flashinfer" if is_flashinfer_available() else "triton"
                 )
             else:
-                if is_hopper_with_cuda_12_3() and is_no_spec_infer_or_topk_one(
-                    server_args
-                ):
+                if is_hopper_with_cuda_12_3():
                     server_args.attention_backend = "fa3"
                 else:
                     server_args.attention_backend = "triton"
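
Note: the docstring added above documents the new attention-backend auto-selection. The sketch below is a standalone paraphrase of that decision tree, not the exact condition in model_runner.py (the real check also consults the model architecture and speculative-decoding settings through helpers such as is_fa3_default_architecture and is_no_spec_infer_or_topk_one); the boolean arguments stand in for those helpers.

    # Hedged sketch of the backend selection described in the docstring above.
    def pick_attention_backend(
        use_mla_backend: bool,
        on_hopper_with_cuda_12_3: bool,
        no_spec_or_topk_one: bool,
        page_size: int,
        flashinfer_available: bool,
    ) -> str:
        if not use_mla_backend:
            # MHA models (e.g. Llama, Qwen): FA3 on Hopper unless spec decode
            # uses topk > 1 or page_size > 1; otherwise flashinfer, then triton.
            if on_hopper_with_cuda_12_3 and no_spec_or_topk_one and page_size == 1:
                return "fa3"
            return "flashinfer" if flashinfer_available else "triton"
        # MLA models: FA3 on Hopper, triton elsewhere.
        return "fa3" if on_hopper_with_cuda_12_3 else "triton"
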
sglang/srt/models/deepseek_v2.py CHANGED
@@ -57,8 +57,8 @@ from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
-    _enable_jit_deepgemm_bmm,
     per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
 )
@@ -86,8 +86,11 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 
 if _is_cuda:
-    from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
     from sgl_kernel import awq_dequantize, bmm_fp8, merge_state_v2
+
+    from sglang.srt.layers.quantization.deep_gemm import (
+        grouped_gemm_nt_f8f8bf16_masked as deep_gemm_grouped_gemm_nt_f8f8bf16_masked,
+    )
 else:
     from vllm._custom_ops import awq_dequantize
 
@@ -702,7 +705,7 @@ class DeepseekV2AttentionMLA(nn.Module):
            q_nope_out = q_nope.new_empty(
                (self.num_local_heads, aligned_m, self.kv_lora_rank)
            )
-            m_grouped_gemm_fp8_fp8_bf16_nt_masked(
+            deep_gemm_grouped_gemm_nt_f8f8bf16_masked(
                (q_nope_val, q_nope_scale),
                (self.w_kc, self.w_scale_k),
                q_nope_out,
@@ -751,7 +754,7 @@ class DeepseekV2AttentionMLA(nn.Module):
            attn_bmm_output = attn_output.new_empty(
                (self.num_local_heads, aligned_m, self.v_head_dim)
            )
-            m_grouped_gemm_fp8_fp8_bf16_nt_masked(
+            deep_gemm_grouped_gemm_nt_f8f8bf16_masked(
                (attn_output_val, attn_output_scale),
                (self.w_vc, self.w_scale_v),
                attn_bmm_output,
@@ -1520,7 +1523,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 
         if (
             _is_cuda
-            and _enable_jit_deepgemm_bmm
+            and _ENABLE_JIT_DEEPGEMM
             and weight_block_size[0] == 128
             and weight_block_size[1] == 128
             and model_dtype == torch.bfloat16
@@ -1628,7 +1631,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                             f"mlp.experts."
                             f"{self.config.n_routed_experts + num_repeat}"
                             f".{suffix}",
-                            weights_dict[shared_expert_weight_name].clone(),
+                            weights_dict[shared_expert_weight_name],
                         )
                     )
                    names_to_remove += [shared_expert_weight_name]
sglang/srt/models/minicpm3.py CHANGED
@@ -40,9 +40,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import add_prefix, is_cuda_available
+from sglang.srt.utils import add_prefix, is_cuda
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import bmm_fp8
 
 
sglang/srt/models/minicpmo.py CHANGED
@@ -25,7 +25,7 @@ import torch.nn.functional as F
 import torch.nn.utils.parametrize as P
 import torch.types
 from torch import nn
-from torch.nn.utils import weight_norm
+from torch.nn.utils import parametrizations
 from tqdm import tqdm
 from transformers import LlamaConfig, LlamaModel, PretrainedConfig, PreTrainedModel
 from transformers.activations import ACT2FN
@@ -585,7 +585,7 @@ class ConditionalChatTTS(PreTrainedModel):
         self.emb_text = nn.Embedding(config.num_text_tokens, config.hidden_size)
         self.head_code = nn.ModuleList(
             [
-                weight_norm(
+                parametrizations.weight_norm(
                     nn.Linear(config.hidden_size, config.num_audio_tokens, bias=False),
                     name="weight",
                 )
@@ -1859,11 +1859,22 @@ class MiniCPMO(MiniCPMBaseModel):
                 # the checkpoint. Skip them.
                 continue
 
-            # adapt to parametrization
+            # For weight_norm parametrization, handle both old and new formats
             if self.config.init_tts and "tts" in name:
-                name = name.replace(".parametrizations", "")
-                name = name.replace(".weight.original0", ".weight_g")
-                name = name.replace(".weight.original1", ".weight_v")
+                # Handle loading from older checkpoints with weight_g/weight_v format
+                if ".weight_g" in name or ".weight_v" in name:
+                    name = name.replace(
+                        ".weight_g", ".parametrizations.weight.original0"
+                    )
+                    name = name.replace(
+                        ".weight_v", ".parametrizations.weight.original1"
+                    )
+                elif ".weight" in name and name not in params_dict:
+                    param_name = name.replace(
+                        ".weight", ".parametrizations.weight.original0"
+                    )
+                    if param_name in params_dict:
+                        name = param_name
 
             # adapt to VisionAttention
             if "vpm" in name:
sglang/srt/openai_api/adapter.py CHANGED
@@ -938,6 +938,35 @@ def v1_chat_generate_request(
 
         if chat_template_name is None:
             openai_compatible_messages = []
+            if (
+                tools
+                and tokenizer_manager.server_args.tool_call_parser == "deepseekv3"
+            ):
+                # add function call prompt to deepseekv3
+                openai_compatible_messages.append(
+                    {
+                        "role": "system",
+                        "content": """You are a helpful Assistant.
+## Tools
+### Function
+You have the following functions available:
+"""
+                        + "".join(
+                            [
+                                f"""
+- `{tool['name']}`:
+```json
+{json.dumps(tool)}
+```
+"""
+                                for tool in tools
+                            ]
+                        ),
+                    }
+                )
+                # TODO fix the compatible issues with xgrammar
+                strict_tag = None
+
             for message in request.messages:
                 if isinstance(message.content, str):
                     openai_compatible_messages.append(
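
Note: when --tool-call-parser deepseekv3 is active and tools are present, the block above injects a system message listing every tool as an inline JSON snippet. A rough reproduction of that prompt construction for a single made-up tool (the real `tools` list is built from the request's function specs earlier in v1_chat_generate_request):

    import json

    # Hypothetical tool definition for illustration only.
    tools = [{"name": "get_weather", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}}}]

    header = "You are a helpful Assistant.\n## Tools\n### Function\nYou have the following functions available:\n"
    system_content = header + "".join(
        f"\n- `{tool['name']}`:\n```json\n{json.dumps(tool)}\n```\n" for tool in tools
    )
    print(system_content)
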
@@ -950,9 +979,16 @@
                         openai_compatible_messages.append(
                             {"role": message.role, "content": content["text"]}
                         )
-            if openai_compatible_messages[-1]["role"] == "assistant":
-                assistant_prefix = openai_compatible_messages[-1]["content"]
-                openai_compatible_messages = openai_compatible_messages[:-1]
+            if (
+                openai_compatible_messages
+                and openai_compatible_messages[-1]["role"] == "assistant"
+            ):
+                if request.continue_final_message:
+                    # Remove the final assistant message so its content can be continued.
+                    assistant_prefix = openai_compatible_messages[-1]["content"]
+                    openai_compatible_messages = openai_compatible_messages[:-1]
+                else:
+                    assistant_prefix = None
             else:
                 assistant_prefix = None
 
@@ -991,7 +1027,33 @@
             modalities = []
         else:
             conv = generate_chat_conv(request, chat_template_name)
-            prompt = conv.get_prompt()
+            # If we should continue the final assistant message, adjust the conversation.
+            if (
+                request.continue_final_message
+                and request.messages
+                and request.messages[-1].role == "assistant"
+            ):
+                # Remove the auto-added blank assistant turn, if present.
+                if conv.messages and conv.messages[-1][1] is None:
+                    conv.messages.pop()
+                # Rebuild the prompt from the conversation.
+                prompt = conv.get_prompt()
+                # Strip any trailing stop tokens or separators that indicate end-of-assistant.
+                if isinstance(conv.stop_str, list):
+                    for stop_token in conv.stop_str:
+                        if prompt.endswith(stop_token):
+                            prompt = prompt[: -len(stop_token)]
+                elif isinstance(conv.stop_str, str) and prompt.endswith(
+                    conv.stop_str
+                ):
+                    prompt = prompt[: -len(conv.stop_str)]
+                if conv.sep and prompt.endswith(conv.sep):
+                    prompt = prompt[: -len(conv.sep)]
+                if getattr(conv, "sep2", None) and prompt.endswith(conv.sep2):
+                    prompt = prompt[: -len(conv.sep2)]
+            else:
+                prompt = conv.get_prompt()
+
             image_data = conv.image_data
             audio_data = conv.audio_data
             modalities = conv.modalities
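
Note: together with the new continue_final_message request field (see the protocol.py change below), the block above lets a client resume a trailing assistant message instead of opening a new assistant turn. A hedged usage sketch against an OpenAI-compatible sglang endpoint (URL and model name are placeholders):

    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",  # placeholder endpoint
        json={
            "model": "default",  # placeholder model name
            "messages": [
                {"role": "user", "content": "Write a haiku about the sea."},
                {"role": "assistant", "content": "Waves fold into foam,"},
            ],
            # Continue the partial assistant text above rather than
            # starting a fresh assistant turn.
            "continue_final_message": True,
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])
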
@@ -1003,6 +1065,7 @@
             else:
                 stop.extend(request.stop)
             prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
+
         else:
             # Use the raw prompt and stop strings if the messages is already a string.
             prompt_ids = request.messages
@@ -1042,6 +1105,8 @@
             sampling_params["json_schema"] = convert_json_schema_to_str(
                 request.response_format.json_schema.schema_
             )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
         elif (
             request.response_format and request.response_format.type == "structural_tag"
         ):
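
Note: the json_object branch above maps OpenAI's response_format={"type": "json_object"} onto the permissive schema '{"type": "object"}', so the grammar backend only constrains the output to be a JSON object. A minimal request sketch (placeholder endpoint and model):

    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",  # placeholder endpoint
        json={
            "model": "default",  # placeholder model name
            "messages": [{"role": "user", "content": "Return a JSON object with keys name and age."}],
            # Internally rewritten to sampling_params["json_schema"] = '{"type": "object"}'.
            "response_format": {"type": "json_object"},
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])
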
@@ -1109,6 +1174,8 @@
            rid=request_ids,
            modalities=modalities_list,
            lora_path=lora_paths,
+            bootstrap_host=all_requests[0].bootstrap_host,
+            bootstrap_room=all_requests[0].bootstrap_room,
        )
 
    return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
sglang/srt/openai_api/protocol.py CHANGED
@@ -252,7 +252,7 @@ ChatCompletionMessageContentPart = Union[
 
 class ChatCompletionMessageGenericParam(BaseModel):
     role: Literal["system", "assistant", "tool"]
-    content: Union[str, List[ChatCompletionMessageContentTextPart]]
+    content: Union[str, List[ChatCompletionMessageContentTextPart], None]
 
 
 class ChatCompletionMessageUserParam(BaseModel):
@@ -355,12 +355,17 @@ class ChatCompletionRequest(BaseModel):
     stop_token_ids: Optional[List[int]] = None
     no_stop_trim: bool = False
     ignore_eos: bool = False
+    continue_final_message: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
     separate_reasoning: bool = True
     stream_reasoning: bool = True
 
+    # For PD disaggregation
+    bootstrap_host: Optional[str] = None
+    bootstrap_room: Optional[int] = None
+
 
 class FunctionResponse(BaseModel):
     """Function response."""
sglang/srt/server_args.py CHANGED
@@ -26,11 +26,8 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
-    get_amdgpu_memory_capacity,
     get_device,
-    get_hpu_memory_capacity,
-    get_nvgpu_memory_capacity,
-    is_cuda,
+    get_device_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -49,6 +46,7 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
+    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -179,6 +177,8 @@ class ServerArgs:
     tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
@@ -218,28 +218,24 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        if is_cuda():
-            gpu_mem = get_nvgpu_memory_capacity()
-        elif is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif self.device == "hpu":
-            gpu_mem = get_hpu_memory_capacity()
-        else:
-            # GPU memory is not known yet or no GPU is available.
-            gpu_mem = None
+        gpu_mem = get_device_memory_capacity(self.device)
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if self.tp_size >= 16:
-                self.mem_fraction_static = 0.79
-            elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.81
-            elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.85
-            elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.87
+            if gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-                self.mem_fraction_static = 0.88
+                # FIXME: more fine grained auto-selection polices
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
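
Note: for devices reporting more than 80 GiB, mem_fraction_static is now derived from the reported capacity instead of the TP size. A worked example, assuming gpu_mem is reported in MiB as in get_nvgpu_memory_capacity (the 96 GiB figure is hypothetical):

    gpu_mem = 96 * 1024                                      # 98304 MiB, a hypothetical >80 GiB device
    mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem    # reserve ~13 GiB of headroom
    print(round(mem_fraction_static, 4))                     # 0.8646
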
@@ -268,8 +264,6 @@
                 self.cuda_graph_max_bs = 8
             else:
                 self.cuda_graph_max_bs = 80
-        else:
-            self.cuda_graph_max_bs = 160
 
         # Set kernel backends for hpu device
         if self.device == "hpu":
@@ -291,13 +285,6 @@
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
 
         # Data parallelism attention
@@ -358,7 +345,18 @@
 
         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.info("speculative_eagle_topk is changed to 1 when page_size > 1")
+            logger.info(
+                "speculative_eagle_topk is adjusted to 1 when page_size > 1"
+            )
+
+        if (
+            self.speculative_eagle_topk == 1
+            and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
+        ):
+            logger.info(
+                "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
+            )
+            self.speculative_num_draft_tokens = self.speculative_num_steps + 1
 
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
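
Note: the new check above pins speculative_num_draft_tokens to speculative_num_steps + 1 whenever speculative_eagle_topk == 1, since (per the context comment) the token from the verify step is also counted. A tiny worked example of the adjustment:

    speculative_num_steps = 5
    speculative_eagle_topk = 1
    speculative_num_draft_tokens = 4       # inconsistent user-supplied value
    if (
        speculative_eagle_topk == 1
        and speculative_num_draft_tokens != speculative_num_steps + 1
    ):
        speculative_num_draft_tokens = speculative_num_steps + 1
    print(speculative_num_draft_tokens)    # 6
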
@@ -380,14 +378,10 @@
         # PD disaggregation
         if self.disaggregation_mode == "prefill":
             self.disable_cuda_graph = True
-            logger.warning("KV cache is forced as chunk cache for decode server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for prefill server")
+            logger.warning("Cuda graph is disabled for prefill server")
         elif self.disaggregation_mode == "decode":
             self.disable_radix_cache = True
-            logger.warning("Cuda graph is disabled for prefill server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for decode server")
+            logger.warning("KV cache is forced as chunk cache for decode server")
 
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
@@ -432,6 +426,11 @@
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -1087,7 +1086,7 @@
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )
@@ -1099,10 +1098,22 @@
         parser.add_argument(
             "--hicache-ratio",
             type=float,
-            required=False,
             default=ServerArgs.hicache_ratio,
             help="The ratio of the size of host KV cache memory pool to the size of device pool.",
         )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
         parser.add_argument(
             "--enable-deepep-moe",
             action="store_true",
@@ -1187,6 +1198,7 @@
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
sglang/srt/speculative/build_eagle_tree.py CHANGED
@@ -4,9 +4,9 @@ from typing import List
 
 import torch
 
-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
-if is_cuda_available() or is_hip():
+if is_cuda() or is_hip():
     from sgl_kernel import (
         build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
     )
sglang/srt/speculative/eagle_utils.py CHANGED
@@ -19,9 +19,9 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import fast_topk, is_cuda_available, is_hip, next_power_of_2
+from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import (
         top_k_renorm_prob,
         top_p_renorm_prob,
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -34,14 +34,9 @@ from sglang.srt.speculative.eagle_utils import (
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import (
-    empty_context,
-    fast_topk,
-    get_available_gpu_memory,
-    is_cuda_available,
-)
+from sglang.srt.utils import empty_context, fast_topk, get_available_gpu_memory, is_cuda
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import segment_packbits
 
 logger = logging.getLogger(__name__)
sglang/srt/utils.py CHANGED
@@ -78,10 +78,34 @@ time_infos = {}
 
 HIP_FP8_E4M3_FNUZ_MAX = 224.0
 
+_warned_bool_env_var_keys = set()
+
 
 def get_bool_env_var(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default)
-    return value.lower() in ("true", "1")
+    value = value.lower()
+
+    truthy_values = ("true", "1")
+    falsy_values = ("false", "0")
+
+    if (value not in truthy_values) and (value not in falsy_values):
+        if value not in _warned_bool_env_var_keys:
+            logger.warning(
+                f"get_bool_env_var({name}) see non-understandable value={value} and treat as false"
+            )
+            _warned_bool_env_var_keys.add(value)
+
+    return value in truthy_values
+
+
+def get_int_env_var(name: str, default: int = 0) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default
 
 
 # https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
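
Note: get_bool_env_var now warns once per unrecognized value and treats it as false, and the new get_int_env_var falls back to its default when the variable is unset, blank, or not an integer. A quick usage sketch (the SGLANG_DEMO_* variable names are made up for illustration):

    import os

    from sglang.srt.utils import get_bool_env_var, get_int_env_var

    os.environ["SGLANG_DEMO_FLAG"] = "yes"      # unrecognized boolean spelling
    os.environ["SGLANG_DEMO_LIMIT"] = "forty"   # not an integer

    get_bool_env_var("SGLANG_DEMO_FLAG")        # warns once, returns False
    get_int_env_var("SGLANG_DEMO_LIMIT", 8)     # parse failure -> default 8
    get_int_env_var("SGLANG_DEMO_MISSING", 8)   # unset -> default 8
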
@@ -130,10 +154,6 @@ def is_flashinfer_available():
     return importlib.util.find_spec("flashinfer") is not None and is_cuda()
 
 
-def is_cuda_available():
-    return is_cuda()
-
-
 _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
 )
@@ -774,6 +794,8 @@ def add_api_key_middleware(app, api_key: str):
             return await call_next(request)
         if request.url.path.startswith("/health"):
             return await call_next(request)
+        if request.url.path.startswith("/metrics"):
+            return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
@@ -930,6 +952,8 @@ def get_zmq_socket(
         buf_size = -1
 
     socket = context.socket(socket_type)
+    if endpoint.find("[") != -1:
+        socket.setsockopt(zmq.IPV6, 1)
 
     def set_send_opt():
         socket.setsockopt(zmq.SNDHWM, 0)
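
Note: get_zmq_socket now turns on the ZMQ IPV6 socket option whenever the endpoint contains a bracketed IPv6 literal. A tiny standalone illustration of the same check (binding may still fail if the host has no IPv6 loopback):

    import zmq

    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    endpoint = "tcp://[::1]:5555"          # bracketed IPv6 literal
    if endpoint.find("[") != -1:
        # Plain sockets are IPv4-only by default; enable IPv6 before bind/connect.
        socket.setsockopt(zmq.IPV6, 1)
    socket.bind(endpoint)
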
@@ -1146,6 +1170,20 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_device_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
@@ -1913,6 +1951,8 @@ def is_page_size_one(server_args):
     return server_args.page_size == 1
 
 
+# TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
+# TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
         server_args.speculative_eagle_topk is not None
@@ -1930,6 +1970,7 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "Gemma2ForCausalLM",
     }
     return architectures[0] in default_archs
 
sglang/test/test_utils.py CHANGED
@@ -450,7 +450,9 @@ def popen_launch_server(
 
         return_code = process.poll()
         if return_code is not None:
-            raise Exception(f"Server unexpectedly exits ({return_code=}).")
+            raise Exception(
+                f"Server unexpectedly exits ({return_code=}). Usually there will be error logs describing the cause far above this line."
+            )
 
         time.sleep(10)
 
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post2"
+__version__ = "0.4.5.post3"