sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +73 -14
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/launch_server.py +2 -0
  5. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  6. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
  7. sglang/srt/checkpoint_engine/__init__.py +9 -0
  8. sglang/srt/checkpoint_engine/update.py +317 -0
  9. sglang/srt/compilation/backend.py +1 -1
  10. sglang/srt/configs/__init__.py +2 -0
  11. sglang/srt/configs/deepseek_ocr.py +542 -10
  12. sglang/srt/configs/deepseekvl2.py +95 -194
  13. sglang/srt/configs/kimi_linear.py +160 -0
  14. sglang/srt/configs/mamba_utils.py +66 -0
  15. sglang/srt/configs/model_config.py +30 -7
  16. sglang/srt/constants.py +7 -0
  17. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  18. sglang/srt/disaggregation/decode.py +34 -6
  19. sglang/srt/disaggregation/nixl/conn.py +2 -2
  20. sglang/srt/disaggregation/prefill.py +25 -3
  21. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  22. sglang/srt/distributed/parallel_state.py +9 -12
  23. sglang/srt/entrypoints/engine.py +31 -20
  24. sglang/srt/entrypoints/grpc_server.py +0 -1
  25. sglang/srt/entrypoints/http_server.py +94 -94
  26. sglang/srt/entrypoints/openai/protocol.py +7 -1
  27. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  28. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  29. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  30. sglang/srt/environ.py +23 -2
  31. sglang/srt/eplb/expert_distribution.py +64 -1
  32. sglang/srt/eplb/expert_location.py +106 -36
  33. sglang/srt/function_call/function_call_parser.py +2 -0
  34. sglang/srt/function_call/minimax_m2.py +367 -0
  35. sglang/srt/grpc/compile_proto.py +3 -0
  36. sglang/srt/layers/activation.py +6 -0
  37. sglang/srt/layers/attention/ascend_backend.py +233 -5
  38. sglang/srt/layers/attention/attention_registry.py +3 -0
  39. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  40. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  41. sglang/srt/layers/attention/fla/kda.py +1359 -0
  42. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  43. sglang/srt/layers/attention/flashattention_backend.py +19 -8
  44. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  45. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
  46. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  47. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  48. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  49. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  50. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  51. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  52. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  53. sglang/srt/layers/attention/nsa_backend.py +157 -23
  54. sglang/srt/layers/attention/triton_backend.py +4 -1
  55. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  56. sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
  57. sglang/srt/layers/attention/utils.py +78 -0
  58. sglang/srt/layers/communicator.py +24 -1
  59. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  60. sglang/srt/layers/layernorm.py +35 -6
  61. sglang/srt/layers/logits_processor.py +9 -20
  62. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  63. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  64. sglang/srt/layers/moe/ep_moe/layer.py +78 -289
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  67. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  68. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  69. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  70. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  71. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  72. sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
  73. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  75. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  76. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  77. sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
  78. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  79. sglang/srt/layers/moe/topk.py +35 -10
  80. sglang/srt/layers/moe/utils.py +3 -4
  81. sglang/srt/layers/pooler.py +21 -2
  82. sglang/srt/layers/quantization/__init__.py +13 -84
  83. sglang/srt/layers/quantization/auto_round.py +394 -0
  84. sglang/srt/layers/quantization/awq.py +0 -3
  85. sglang/srt/layers/quantization/base_config.py +7 -0
  86. sglang/srt/layers/quantization/fp8.py +68 -63
  87. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  88. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  89. sglang/srt/layers/quantization/gguf.py +566 -0
  90. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  91. sglang/srt/layers/quantization/mxfp4.py +30 -38
  92. sglang/srt/layers/quantization/unquant.py +23 -45
  93. sglang/srt/layers/quantization/w4afp8.py +38 -2
  94. sglang/srt/layers/radix_attention.py +5 -2
  95. sglang/srt/layers/rotary_embedding.py +130 -46
  96. sglang/srt/layers/sampler.py +12 -1
  97. sglang/srt/lora/lora_registry.py +9 -0
  98. sglang/srt/managers/async_mm_data_processor.py +122 -0
  99. sglang/srt/managers/data_parallel_controller.py +30 -3
  100. sglang/srt/managers/detokenizer_manager.py +3 -0
  101. sglang/srt/managers/io_struct.py +29 -4
  102. sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
  103. sglang/srt/managers/schedule_batch.py +74 -15
  104. sglang/srt/managers/scheduler.py +185 -144
  105. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  107. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  108. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  109. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  110. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  111. sglang/srt/managers/session_controller.py +6 -5
  112. sglang/srt/managers/tokenizer_manager.py +165 -78
  113. sglang/srt/managers/tp_worker.py +24 -1
  114. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  115. sglang/srt/mem_cache/common.py +1 -0
  116. sglang/srt/mem_cache/hicache_storage.py +7 -1
  117. sglang/srt/mem_cache/memory_pool.py +253 -57
  118. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  119. sglang/srt/mem_cache/radix_cache.py +4 -0
  120. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  121. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  122. sglang/srt/metrics/collector.py +46 -3
  123. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  124. sglang/srt/model_executor/forward_batch_info.py +55 -14
  125. sglang/srt/model_executor/model_runner.py +77 -170
  126. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  127. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  128. sglang/srt/model_loader/weight_utils.py +1 -1
  129. sglang/srt/models/bailing_moe.py +9 -2
  130. sglang/srt/models/deepseek_nextn.py +11 -2
  131. sglang/srt/models/deepseek_v2.py +296 -78
  132. sglang/srt/models/glm4.py +391 -77
  133. sglang/srt/models/glm4_moe.py +322 -354
  134. sglang/srt/models/glm4_moe_nextn.py +4 -14
  135. sglang/srt/models/glm4v.py +196 -55
  136. sglang/srt/models/glm4v_moe.py +29 -197
  137. sglang/srt/models/gpt_oss.py +1 -10
  138. sglang/srt/models/kimi_linear.py +678 -0
  139. sglang/srt/models/llama4.py +1 -1
  140. sglang/srt/models/llama_eagle3.py +11 -1
  141. sglang/srt/models/longcat_flash.py +2 -2
  142. sglang/srt/models/minimax_m2.py +922 -0
  143. sglang/srt/models/nvila.py +355 -0
  144. sglang/srt/models/nvila_lite.py +184 -0
  145. sglang/srt/models/qwen2.py +23 -2
  146. sglang/srt/models/qwen2_moe.py +30 -15
  147. sglang/srt/models/qwen3.py +35 -5
  148. sglang/srt/models/qwen3_moe.py +18 -12
  149. sglang/srt/models/qwen3_next.py +7 -0
  150. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  151. sglang/srt/multimodal/processors/base_processor.py +1 -0
  152. sglang/srt/multimodal/processors/glm4v.py +1 -1
  153. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  154. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  155. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  156. sglang/srt/multiplex/pdmux_context.py +164 -0
  157. sglang/srt/parser/conversation.py +7 -1
  158. sglang/srt/parser/reasoning_parser.py +28 -1
  159. sglang/srt/sampling/custom_logit_processor.py +67 -1
  160. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  161. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  162. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  163. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  164. sglang/srt/server_args.py +459 -199
  165. sglang/srt/single_batch_overlap.py +2 -4
  166. sglang/srt/speculative/draft_utils.py +16 -0
  167. sglang/srt/speculative/eagle_info.py +42 -36
  168. sglang/srt/speculative/eagle_info_v2.py +68 -25
  169. sglang/srt/speculative/eagle_utils.py +261 -16
  170. sglang/srt/speculative/eagle_worker.py +11 -3
  171. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  172. sglang/srt/speculative/spec_info.py +305 -31
  173. sglang/srt/speculative/spec_utils.py +44 -8
  174. sglang/srt/tracing/trace.py +121 -12
  175. sglang/srt/utils/common.py +142 -74
  176. sglang/srt/utils/hf_transformers_utils.py +38 -12
  177. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  178. sglang/test/kits/radix_cache_server_kit.py +50 -0
  179. sglang/test/runners.py +31 -7
  180. sglang/test/simple_eval_common.py +5 -3
  181. sglang/test/simple_eval_humaneval.py +1 -0
  182. sglang/test/simple_eval_math.py +1 -0
  183. sglang/test/simple_eval_mmlu.py +1 -0
  184. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  185. sglang/test/test_deterministic.py +235 -12
  186. sglang/test/test_deterministic_utils.py +2 -1
  187. sglang/test/test_utils.py +7 -1
  188. sglang/version.py +1 -1
  189. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
  190. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
  191. sglang/srt/models/vila.py +0 -306
  192. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  193. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  194. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  195. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py CHANGED
@@ -101,7 +101,7 @@ class Engine(EngineBase):

  Note:
  1. The HTTP server, Engine, and TokenizerManager all run in the main process.
- 2. Inter-process communication (IPC) is handled via the ZMQ library, with each process using a different port.
+ 2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
  """

  def __init__(self, **kwargs):
@@ -109,6 +109,8 @@ class Engine(EngineBase):
  The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`.
  Please refer to `ServerArgs` for the documentation.
  """
+
+ # Parse server_args
  if "server_args" in kwargs:
  # Directly load server_args
  server_args = kwargs["server_args"]
@@ -118,34 +120,36 @@ class Engine(EngineBase):
  # Do not print logs by default
  kwargs["log_level"] = "error"
  server_args = ServerArgs(**kwargs)
+ self.server_args = server_args
+ logger.info(f"{server_args=}")

  # Shutdown the subprocesses automatically when the program exits
  atexit.register(self.shutdown)

- # Allocate ports for inter-process communications
- self.port_args = PortArgs.init_new(server_args)
- logger.info(f"{server_args=}")
-
  # Launch subprocesses
- tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
- server_args=server_args,
- port_args=self.port_args,
+ tokenizer_manager, template_manager, scheduler_info, port_args = (
+ _launch_subprocesses(server_args=server_args)
  )
- self.server_args = server_args
  self.tokenizer_manager = tokenizer_manager
  self.template_manager = template_manager
  self.scheduler_info = scheduler_info
+ self.port_args = port_args

+ # Initialize ZMQ sockets
  context = zmq.Context(2)
  self.send_to_rpc = get_zmq_socket(
  context, zmq.DEALER, self.port_args.rpc_ipc_name, True
  )

+ # Enable tracing
  if server_args.enable_trace:
- process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
- if server_args.disaggregation_mode == "null":
- thread_label = "Tokenizer"
- trace_set_thread_info(thread_label)
+ process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+ thread_label = "Tokenizer"
+ if server_args.disaggregation_mode == "prefill":
+ thread_label = "Prefill Tokenizer"
+ elif server_args.disaggregation_mode == "decode":
+ thread_label = "Decode Tokenizer"
+ trace_set_thread_info(thread_label)

  try:
  self.loop = asyncio.get_running_loop()
@@ -311,6 +315,7 @@ class Engine(EngineBase):
  image_data: Optional[MultimodalDataInputFormat] = None,
  audio_data: Optional[MultimodalDataInputFormat] = None,
  video_data: Optional[MultimodalDataInputFormat] = None,
+ dimensions: Optional[int] = None,
  ) -> Dict:
  """
  The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
@@ -321,6 +326,7 @@ class Engine(EngineBase):
  image_data=image_data,
  audio_data=audio_data,
  video_data=video_data,
+ dimensions=dimensions,
  )
  generator = self.tokenizer_manager.generate_request(obj, None)
  ret = self.loop.run_until_complete(generator.__anext__())
@@ -332,6 +338,7 @@ class Engine(EngineBase):
  image_data: Optional[MultimodalDataInputFormat] = None,
  audio_data: Optional[MultimodalDataInputFormat] = None,
  video_data: Optional[MultimodalDataInputFormat] = None,
+ dimensions: Optional[int] = None,
  ) -> Dict:
  """
  Asynchronous version of encode method.
@@ -344,6 +351,7 @@ class Engine(EngineBase):
  image_data=image_data,
  audio_data=audio_data,
  video_data=video_data,
+ dimensions=dimensions,
  )
  generator = self.tokenizer_manager.generate_request(obj, None)
  return await generator.__anext__()
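For reference, the new `dimensions` argument is threaded from `Engine.encode` / `async_encode` into `EmbeddingReqInput`, so embedding truncation can be requested from the Python API. A minimal sketch, assuming an embedding model that supports dimension truncation (the model path is a placeholder):

import sglang as sgl

# Launch an embedding engine and request a truncated embedding.
engine = sgl.Engine(model_path="Qwen/Qwen3-Embedding-0.6B", is_embedding=True)
out = engine.encode("The capital of France is Paris.", dimensions=128)
print(out)  # the returned dict carries the (truncated) embedding
engine.shutdown()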
@@ -669,18 +677,21 @@ class Engine(EngineBase):
  def _set_envs_and_config(server_args: ServerArgs):
  # Set global environments
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
- os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
+ if "NCCL_CUMEM_ENABLE" not in os.environ:
+ os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
  if not server_args.enable_symm_mem:
  os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
- os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
+ os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"
  os.environ["CUDA_MODULE_LOADING"] = "AUTO"
- # flashinfer uses this environment variable for various kernels from MoE to quant kernels
+
  if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+ # flashinfer uses this environment variable for various kernels from MoE to quant kernels
  os.environ["TRTLLM_ENABLE_PDL"] = "1"

  if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
  # Default to warning level, to avoid too many logs
  os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
+
  if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
  # Need to set log to console, otherwise the log level won't take effect
  os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
@@ -709,7 +720,7 @@ def _set_envs_and_config(server_args: ServerArgs):
  if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
  assert_pkg_version(
  "sgl-kernel",
- "0.3.16.post3",
+ "0.3.16.post4",
  "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
  )

@@ -840,7 +851,7 @@ def _launch_subprocesses(

  if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
  # When using `Engine` as a Python API, we don't want to block here.
- return None, None, None
+ return None, None, None, port_args

  launch_dummy_health_check_server(
  server_args.host, server_args.port, server_args.enable_metrics
@@ -851,7 +862,7 @@ def _launch_subprocesses(
  logger.error(
  f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
  )
- return None, None, None
+ return None, None, None, port_args

  # Launch detokenizer process
  detoken_proc = mp.Process(
@@ -897,4 +908,4 @@ def _launch_subprocesses(

  tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]

- return tokenizer_manager, template_manager, scheduler_info
+ return tokenizer_manager, template_manager, scheduler_info, port_args
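Port allocation has moved inside `_launch_subprocesses`, which now returns `port_args` as a fourth element, so any code calling this internal helper directly must unpack the extra value. A minimal sketch (the model path is a placeholder):

from sglang.srt.entrypoints.engine import _launch_subprocesses
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(model_path="Qwen/Qwen2.5-7B-Instruct")
# PortArgs is now created inside the helper and handed back to the caller.
tokenizer_manager, template_manager, scheduler_info, port_args = _launch_subprocesses(
    server_args=server_args
)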
sglang/srt/entrypoints/grpc_server.py CHANGED
@@ -999,7 +999,6 @@ def _wait_and_warmup_grpc(
  # Mark health service as SERVING after warmup completes
  if health_servicer:
  health_servicer.set_serving()
- logger.info("Health service marked as SERVING")

  logger.info("The server is fired up and ready to roll!")

sglang/srt/entrypoints/http_server.py CHANGED
@@ -20,7 +20,7 @@ This file implements HTTP APIs for the inference engine via fastapi.
  import asyncio
  import dataclasses
  import logging
- import multiprocessing as multiprocessing
+ import multiprocessing
  import os
  import tempfile
  import threading
@@ -165,6 +165,7 @@ async def init_multi_tokenizer() -> ServerArgs:
  server_args.api_key is None
  ), "API key is not supported in multi-tokenizer mode"

+ # Create a new ipc name for the current process
  port_args.tokenizer_ipc_name = (
  f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
  )
@@ -184,6 +185,7 @@ async def init_multi_tokenizer() -> ServerArgs:
  )

  tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
  set_global_state(
  _GlobalState(
  tokenizer_manager=tokenizer_manager,
@@ -192,36 +194,38 @@ async def init_multi_tokenizer() -> ServerArgs:
  )
  )

- if server_args.enable_trace:
- process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
- if server_args.disaggregation_mode == "null":
- thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
- trace_set_thread_info(thread_label)
-
  return server_args


  @asynccontextmanager
  async def lifespan(fast_api_app: FastAPI):
- if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
+ if getattr(fast_api_app, "is_single_tokenizer_mode", False):
+ server_args = fast_api_app.server_args
+ warmup_thread_args = fast_api_app.warmup_thread_args
+ thread_label = "Tokenizer"
+ else:
  # Initialize multi-tokenizer support for worker processes
- fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
-
- # only metrics middleware is supported in multi-tokenizer mode
- worker_pid = os.getpid()
- if fast_api_app.server_args.enable_metrics:
- add_prometheus_middleware(app)
- enable_func_timer()
-
- logger.info(f"Worker {worker_pid} added prometheus middleware")
- fast_api_app.warmup_thread = threading.Thread(
- target=_wait_and_warmup,
- args=(
- fast_api_app.server_args,
- None, # pipe_finish_writer not needed in worker
- None, # launch_callback not needed in worker
- ),
+ server_args = await init_multi_tokenizer()
+ warmup_thread_args = (
+ server_args,
+ None,
+ None,
  )
+ thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"
+
+ # Add prometheus middleware
+ if server_args.enable_metrics:
+ add_prometheus_middleware(app)
+ enable_func_timer()
+
+ # Init tracing
+ if server_args.enable_trace:
+ process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+ if server_args.disaggregation_mode == "prefill":
+ thread_label = "Prefill" + thread_label
+ elif server_args.disaggregation_mode == "decode":
+ thread_label = "Decode" + thread_label
+ trace_set_thread_info(thread_label)

  # Initialize OpenAI serving handlers
  fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
@@ -249,8 +253,7 @@ async def lifespan(fast_api_app: FastAPI):
  _global_state.tokenizer_manager
  )

- server_args: ServerArgs = fast_api_app.server_args
-
+ # Launch tool server
  tool_server = None
  if server_args.tool_server == "demo":
  from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
@@ -274,12 +277,11 @@ async def lifespan(fast_api_app: FastAPI):
  enable_force_include_usage=True,
  tool_server=tool_server,
  )
- except Exception as e:
- import traceback
-
- traceback.print_exc()
- logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+ except Exception:
+ traceback = get_exception_traceback()
+ logger.warning(f"Can not initialize OpenAIServingResponses, error: {traceback}")

+ # Execute custom warmups
  if server_args.warmups is not None:
  await execute_warmups(
  server_args.disaggregation_mode,
@@ -288,18 +290,18 @@ async def lifespan(fast_api_app: FastAPI):
  )
  logger.info("Warmup ended")

- warmup_thread = getattr(fast_api_app, "warmup_thread", None)
- if warmup_thread is not None:
- warmup_thread.start()
+ # Execute the general warmup
+ warmup_thread = threading.Thread(
+ target=_wait_and_warmup,
+ args=warmup_thread_args,
+ )
+ warmup_thread.start()

+ # Start the HTTP server
  try:
  yield
  finally:
- if server_args.tokenizer_worker_num > 1:
- pid = os.getpid()
- logger.info(f"uvicorn worker {pid} ending...")
- warmup_thread.join()
- logger.info(f"uvicorn worker {pid} ended.")
+ warmup_thread.join()


  # Fast API
@@ -499,6 +501,11 @@ async def get_server_info():
  internal_states: List[Dict[Any, Any]] = (
  await _global_state.tokenizer_manager.get_internal_state()
  )
+
+ # This field is not serializable.
+ if hasattr(_global_state.tokenizer_manager.server_args, "model_config"):
+ del _global_state.tokenizer_manager.server_args.model_config
+
  return {
  **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
  **_global_state.scheduler_info,
@@ -1164,6 +1171,8 @@ async def available_models():
  """Show available models. OpenAI-compatible endpoint."""
  served_model_names = [_global_state.tokenizer_manager.served_model_name]
  model_cards = []
+
+ # Add base model
  for served_model_name in served_model_names:
  model_cards.append(
  ModelCard(
@@ -1172,6 +1181,20 @@ async def available_models():
  max_model_len=_global_state.tokenizer_manager.model_config.context_len,
  )
  )
+
+ # Add loaded LoRA adapters
+ if _global_state.tokenizer_manager.server_args.enable_lora:
+ lora_registry = _global_state.tokenizer_manager.lora_registry
+ for _, lora_ref in lora_registry.get_all_adapters().items():
+ model_cards.append(
+ ModelCard(
+ id=lora_ref.lora_name,
+ root=lora_ref.lora_path,
+ parent=served_model_names[0],
+ max_model_len=None,
+ )
+ )
+
  return ModelList(data=model_cards)

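With LoRA serving enabled, `/v1/models` now returns one model card per loaded adapter in addition to the base model, with `parent` pointing at the base model. A quick check against a locally running server, assuming the default port:

import requests

# Lists the base model plus any LoRA adapters registered via --enable-lora / --lora-paths.
models = requests.get("http://localhost:30000/v1/models").json()
for card in models["data"]:
    print(card["id"], "->", card.get("parent"))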
 
@@ -1328,27 +1351,12 @@ def launch_server(
  3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.

  Note:
- 1. The HTTP server, Engine, and TokenizerManager both run in the main process.
+ 1. The HTTP server, Engine, and TokenizerManager all run in the main process.
  2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
  """
- if server_args.tokenizer_worker_num > 1:
- port_args = PortArgs.init_new(server_args)
- port_args.tokenizer_worker_ipc_name = (
- f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
- )
- tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
- server_args=server_args, port_args=port_args
- )
- else:
- tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
- server_args=server_args,
- )
-
- if server_args.enable_trace:
- process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
- if server_args.disaggregation_mode == "null":
- thread_label = "Tokenizer"
- trace_set_thread_info(thread_label)
+ tokenizer_manager, template_manager, scheduler_info, port_args = (
+ _launch_subprocesses(server_args=server_args)
+ )

  set_global_state(
  _GlobalState(
@@ -1358,40 +1366,45 @@ def launch_server(
  )
  )

- if server_args.tokenizer_worker_num > 1:
- multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
- port_args,
+ # Pass additional arguments to the lifespan function.
+ # They will be used for additional initialization setups.
+ if server_args.tokenizer_worker_num == 1:
+ # If it is single tokenizer mode, we can pass the arguments by attributes of the app object.
+ app.is_single_tokenizer_mode = True
+ app.server_args = server_args
+ app.warmup_thread_args = (
  server_args,
- scheduler_info,
+ pipe_finish_writer,
+ launch_callback,
  )
- else:
+
  # Add api key authorization
+ # This is only supported in single tokenizer mode.
  if server_args.api_key:
  add_api_key_middleware(app, server_args.api_key)
-
- # Add prometheus middleware
- if server_args.enable_metrics:
- add_prometheus_middleware(app)
- enable_func_timer()
-
- # Send a warmup request - we will create the thread launch it
- # in the lifespan after all other warmups have fired.
- warmup_thread = threading.Thread(
- target=_wait_and_warmup,
- args=(
- server_args,
- pipe_finish_writer,
- launch_callback,
- ),
+ else:
+ # If it is multi-tokenizer mode, we need to write the arguments to shared memory
+ # for other worker processes to read.
+ app.is_single_tokenizer_mode = False
+ multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+ port_args, server_args, scheduler_info
  )
- app.warmup_thread = warmup_thread

  try:
  # Update logging configs
  set_uvicorn_logging_configs()
- app.server_args = server_args
+
  # Listen for HTTP requests
- if server_args.tokenizer_worker_num > 1:
+ if server_args.tokenizer_worker_num == 1:
+ uvicorn.run(
+ app,
+ host=server_args.host,
+ port=server_args.port,
+ log_level=server_args.log_level_http or server_args.log_level,
+ timeout_keep_alive=5,
+ loop="uvloop",
+ )
+ else:
  from uvicorn.config import LOGGING_CONFIG

  LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
@@ -1399,7 +1412,6 @@ def launch_server(
  "level": "INFO",
  "propagate": False,
  }
-
  monkey_patch_uvicorn_multiprocessing()

  uvicorn.run(
@@ -1411,22 +1423,10 @@ def launch_server(
  loop="uvloop",
  workers=server_args.tokenizer_worker_num,
  )
- else:
- app.is_single_tokenizer_mode = True
- uvicorn.run(
- app,
- host=server_args.host,
- port=server_args.port,
- log_level=server_args.log_level_http or server_args.log_level,
- timeout_keep_alive=5,
- loop="uvloop",
- )
  finally:
  if server_args.tokenizer_worker_num > 1:
  multi_tokenizer_args_shm.unlink()
  _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
- else:
- warmup_thread.join()


  def _execute_server_warmup(
sglang/srt/entrypoints/openai/protocol.py CHANGED
@@ -37,7 +37,11 @@ from pydantic import (
  model_validator,
  )
  from typing_extensions import Literal
- from xgrammar import StructuralTag
+
+ try:
+ from xgrammar import StructuralTag
+ except:
+ StructuralTag = Any

  from sglang.utils import convert_json_schema_to_str

@@ -54,6 +58,7 @@ class ModelCard(BaseModel):
  created: int = Field(default_factory=lambda: int(time.time()))
  owned_by: str = "sglang"
  root: Optional[str] = None
+ parent: Optional[str] = None
  max_model_len: Optional[int] = None


@@ -108,6 +113,7 @@ class UsageInfo(BaseModel):

  class StreamOptions(BaseModel):
  include_usage: Optional[bool] = False
+ continuous_usage_stats: Optional[bool] = False


  class JsonSchemaResponseFormat(BaseModel):
sglang/srt/entrypoints/openai/serving_chat.py CHANGED
@@ -535,6 +535,17 @@ class OpenAIServingChat(OpenAIServingBase):
  choices=[choice_data],
  model=request.model,
  )
+
+ # Add usage stats if continuous_usage_stats is enabled
+ if (
+ request.stream_options
+ and request.stream_options.continuous_usage_stats
+ ):
+ chunk.usage = UsageProcessor.calculate_token_usage(
+ prompt_tokens=prompt_tokens.get(index, 0),
+ completion_tokens=completion_tokens.get(index, 0),
+ )
+
  yield f"data: {chunk.model_dump_json()}\n\n"

  # Handle tool calls
@@ -579,6 +590,17 @@ class OpenAIServingChat(OpenAIServingBase):
  choices=[choice_data],
  model=request.model,
  )
+
+ # Add usage stats if continuous_usage_stats is enabled
+ if (
+ request.stream_options
+ and request.stream_options.continuous_usage_stats
+ ):
+ chunk.usage = UsageProcessor.calculate_token_usage(
+ prompt_tokens=prompt_tokens.get(index, 0),
+ completion_tokens=completion_tokens.get(index, 0),
+ )
+
  yield f"data: {chunk.model_dump_json()}\n\n"

  # Send finish_reason chunks for each index that completed
@@ -1056,6 +1078,16 @@ class OpenAIServingChat(OpenAIServingBase):
  choices=[choice_data],
  model=request.model,
  )
+
+ # Add usage stats if continuous_usage_stats is enabled
+ if request.stream_options and request.stream_options.continuous_usage_stats:
+ prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+ completion_tokens = content["meta_info"].get("completion_tokens", 0)
+ chunk.usage = UsageProcessor.calculate_token_usage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ )
+
  yield f"data: {chunk.model_dump_json()}\n\n"

  # Yield tool calls
@@ -1096,6 +1128,16 @@ class OpenAIServingChat(OpenAIServingBase):
  choices=[choice_data],
  model=request.model,
  )
+
+ # Add usage stats if continuous_usage_stats is enabled
+ if request.stream_options and request.stream_options.continuous_usage_stats:
+ prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+ completion_tokens = content["meta_info"].get("completion_tokens", 0)
+ chunk.usage = UsageProcessor.calculate_token_usage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ )
+
  yield f"data: {chunk.model_dump_json()}\n\n"

  def _check_for_unstreamed_tool_args(
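Together with the new `StreamOptions.continuous_usage_stats` field shown earlier, streaming chat and completion responses can attach a `usage` object to every chunk rather than only the last one. A client-side sketch over plain HTTP, assuming the default port (the model name is a placeholder):

import json
import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",  # placeholder; use the served model name
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,
        "stream_options": {"include_usage": True, "continuous_usage_stats": True},
    },
    stream=True,
)
for line in resp.iter_lines():
    if line and line.startswith(b"data: ") and line != b"data: [DONE]":
        chunk = json.loads(line[len(b"data: "):])
        print(chunk.get("usage"))  # populated on every chunk when the flag is set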
sglang/srt/entrypoints/openai/serving_completions.py CHANGED
@@ -272,6 +272,16 @@ class OpenAIServingCompletion(OpenAIServingBase):
  model=request.model,
  )

+ # Add usage stats if continuous_usage_stats is enabled
+ if (
+ request.stream_options
+ and request.stream_options.continuous_usage_stats
+ ):
+ chunk.usage = UsageProcessor.calculate_token_usage(
+ prompt_tokens=prompt_tokens.get(index, 0),
+ completion_tokens=completion_tokens.get(index, 0),
+ )
+
  yield f"data: {chunk.model_dump_json()}\n\n"

  if request.return_hidden_states and hidden_states:
sglang/srt/entrypoints/openai/serving_embedding.py CHANGED
@@ -126,6 +126,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
  **prompt_kwargs,
  rid=request.rid,
  priority=request.priority,
+ dimensions=request.dimensions,
  )

  return adapted_request, request
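This wires the standard OpenAI `dimensions` parameter of the embeddings endpoint through to `EmbeddingReqInput`, matching the `Engine.encode` change earlier in this diff. A quick sketch, assuming the default port and a model that supports truncated embeddings (the model name is a placeholder):

import requests

resp = requests.post(
    "http://localhost:30000/v1/embeddings",
    json={"model": "default", "input": "hello world", "dimensions": 128},
).json()
print(len(resp["data"][0]["embedding"]))  # expected to be 128 when truncation is honored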
sglang/srt/environ.py CHANGED
@@ -111,25 +111,31 @@ class Envs:
  # Model & File Download
  SGLANG_USE_MODELSCOPE = EnvBool(False)

+ # Logging Options
+ SGLANG_LOG_GC = EnvBool(False)
+ SGLANG_LOG_FORWARD_ITERS = EnvBool(False)
+ SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+
  # Test & Debug
  SGLANG_IS_IN_CI = EnvBool(False)
  SGLANG_IS_IN_CI_AMD = EnvBool(False)
  SGLANG_SET_CPU_AFFINITY = EnvBool(False)
  SGLANG_PROFILE_WITH_STACK = EnvBool(True)
  SGLANG_RECORD_STEP_TIME = EnvBool(False)
- SGLANG_GC_LOG = EnvBool(False)
  SGLANG_FORCE_SHUTDOWN = EnvBool(False)
  SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
  SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
  SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
- SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
  SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
  SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
  SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
+ SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS = EnvInt(500)
+ SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE = EnvInt(64)

  # Scheduler: memory leak test
  SGLANG_TEST_RETRACT = EnvBool(False)
  SGLANG_TEST_RETRACT_INTERVAL = EnvInt(3)
+ SGLANG_TEST_RETRACT_NO_PREFILL_BS = EnvInt(2 ** 31)
  SGLANG_ENABLE_RUNTIME_MEM_LEAK_CHECK = EnvBool(False)

  # Scheduler: new token ratio hyperparameters
@@ -177,6 +183,7 @@ class Envs:

  # Triton
  SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
+ SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE = EnvBool(False)

  # Torch Compile
  SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)
@@ -228,12 +235,16 @@ class Envs:
  SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)

  # Overlap Spec V2
+ SGLANG_ENABLE_SPEC_V2 = EnvBool(False)
  SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)

  # VLM
  SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
  SGLANG_RESIZE_RESAMPLE = EnvStr("")

+ # Release & Resume Memory
+ SGLANG_MEMORY_SAVER_CUDA_GRAPH = EnvBool(False)
+
  # Ktransformers
  SGLANG_KT_MOE_NUM_GPU_EXPERTS = EnvInt(None)
  SGLANG_KT_MOE_CPUINFER = EnvInt(None)
@@ -251,7 +262,17 @@
  envs = Envs()


+ def _print_deprecated_env(new_name: str, old_name: str):
+ if old_name in os.environ:
+ warnings.warn(
+ f"Environment variable {old_name} will be deprecated, please use {new_name} instead"
+ )
+ os.environ[new_name] = os.environ[old_name]
+
+
  def _convert_SGL_to_SGLANG():
+ _print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
+
  for key, value in os.environ.items():
  if key.startswith("SGL_"):
  new_key = key.replace("SGL_", "SGLANG_", 1)
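The practical effect of the environ changes: `SGLANG_GC_LOG` is renamed to `SGLANG_LOG_GC`, and the shim above keeps the old spelling working with a deprecation warning. A self-contained sketch of that mapping, reusing the same logic as the new `_print_deprecated_env` helper:

import os
import warnings

def _print_deprecated_env(new_name: str, old_name: str):
    # Same logic as the shim added in environ.py: copy the old variable to the new name.
    if old_name in os.environ:
        warnings.warn(
            f"Environment variable {old_name} will be deprecated, please use {new_name} instead"
        )
        os.environ[new_name] = os.environ[old_name]

os.environ["SGLANG_GC_LOG"] = "1"                      # old, deprecated spelling
_print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
print(os.environ["SGLANG_LOG_GC"])                     # "1"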