sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +73 -14
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/launch_server.py +2 -0
  5. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  6. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
  7. sglang/srt/checkpoint_engine/__init__.py +9 -0
  8. sglang/srt/checkpoint_engine/update.py +317 -0
  9. sglang/srt/compilation/backend.py +1 -1
  10. sglang/srt/configs/__init__.py +2 -0
  11. sglang/srt/configs/deepseek_ocr.py +542 -10
  12. sglang/srt/configs/deepseekvl2.py +95 -194
  13. sglang/srt/configs/kimi_linear.py +160 -0
  14. sglang/srt/configs/mamba_utils.py +66 -0
  15. sglang/srt/configs/model_config.py +30 -7
  16. sglang/srt/constants.py +7 -0
  17. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  18. sglang/srt/disaggregation/decode.py +34 -6
  19. sglang/srt/disaggregation/nixl/conn.py +2 -2
  20. sglang/srt/disaggregation/prefill.py +25 -3
  21. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  22. sglang/srt/distributed/parallel_state.py +9 -12
  23. sglang/srt/entrypoints/engine.py +31 -20
  24. sglang/srt/entrypoints/grpc_server.py +0 -1
  25. sglang/srt/entrypoints/http_server.py +94 -94
  26. sglang/srt/entrypoints/openai/protocol.py +7 -1
  27. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  28. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  29. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  30. sglang/srt/environ.py +23 -2
  31. sglang/srt/eplb/expert_distribution.py +64 -1
  32. sglang/srt/eplb/expert_location.py +106 -36
  33. sglang/srt/function_call/function_call_parser.py +2 -0
  34. sglang/srt/function_call/minimax_m2.py +367 -0
  35. sglang/srt/grpc/compile_proto.py +3 -0
  36. sglang/srt/layers/activation.py +6 -0
  37. sglang/srt/layers/attention/ascend_backend.py +233 -5
  38. sglang/srt/layers/attention/attention_registry.py +3 -0
  39. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  40. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  41. sglang/srt/layers/attention/fla/kda.py +1359 -0
  42. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  43. sglang/srt/layers/attention/flashattention_backend.py +19 -8
  44. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  45. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
  46. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  47. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  48. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  49. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  50. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  51. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  52. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  53. sglang/srt/layers/attention/nsa_backend.py +157 -23
  54. sglang/srt/layers/attention/triton_backend.py +4 -1
  55. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  56. sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
  57. sglang/srt/layers/attention/utils.py +78 -0
  58. sglang/srt/layers/communicator.py +24 -1
  59. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  60. sglang/srt/layers/layernorm.py +35 -6
  61. sglang/srt/layers/logits_processor.py +9 -20
  62. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  63. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  64. sglang/srt/layers/moe/ep_moe/layer.py +78 -289
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  67. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  68. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  69. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  70. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  71. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  72. sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
  73. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  75. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  76. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  77. sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
  78. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  79. sglang/srt/layers/moe/topk.py +35 -10
  80. sglang/srt/layers/moe/utils.py +3 -4
  81. sglang/srt/layers/pooler.py +21 -2
  82. sglang/srt/layers/quantization/__init__.py +13 -84
  83. sglang/srt/layers/quantization/auto_round.py +394 -0
  84. sglang/srt/layers/quantization/awq.py +0 -3
  85. sglang/srt/layers/quantization/base_config.py +7 -0
  86. sglang/srt/layers/quantization/fp8.py +68 -63
  87. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  88. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  89. sglang/srt/layers/quantization/gguf.py +566 -0
  90. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  91. sglang/srt/layers/quantization/mxfp4.py +30 -38
  92. sglang/srt/layers/quantization/unquant.py +23 -45
  93. sglang/srt/layers/quantization/w4afp8.py +38 -2
  94. sglang/srt/layers/radix_attention.py +5 -2
  95. sglang/srt/layers/rotary_embedding.py +130 -46
  96. sglang/srt/layers/sampler.py +12 -1
  97. sglang/srt/lora/lora_registry.py +9 -0
  98. sglang/srt/managers/async_mm_data_processor.py +122 -0
  99. sglang/srt/managers/data_parallel_controller.py +30 -3
  100. sglang/srt/managers/detokenizer_manager.py +3 -0
  101. sglang/srt/managers/io_struct.py +29 -4
  102. sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
  103. sglang/srt/managers/schedule_batch.py +74 -15
  104. sglang/srt/managers/scheduler.py +185 -144
  105. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  107. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  108. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  109. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  110. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  111. sglang/srt/managers/session_controller.py +6 -5
  112. sglang/srt/managers/tokenizer_manager.py +165 -78
  113. sglang/srt/managers/tp_worker.py +24 -1
  114. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  115. sglang/srt/mem_cache/common.py +1 -0
  116. sglang/srt/mem_cache/hicache_storage.py +7 -1
  117. sglang/srt/mem_cache/memory_pool.py +253 -57
  118. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  119. sglang/srt/mem_cache/radix_cache.py +4 -0
  120. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  121. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  122. sglang/srt/metrics/collector.py +46 -3
  123. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  124. sglang/srt/model_executor/forward_batch_info.py +55 -14
  125. sglang/srt/model_executor/model_runner.py +77 -170
  126. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  127. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  128. sglang/srt/model_loader/weight_utils.py +1 -1
  129. sglang/srt/models/bailing_moe.py +9 -2
  130. sglang/srt/models/deepseek_nextn.py +11 -2
  131. sglang/srt/models/deepseek_v2.py +296 -78
  132. sglang/srt/models/glm4.py +391 -77
  133. sglang/srt/models/glm4_moe.py +322 -354
  134. sglang/srt/models/glm4_moe_nextn.py +4 -14
  135. sglang/srt/models/glm4v.py +196 -55
  136. sglang/srt/models/glm4v_moe.py +29 -197
  137. sglang/srt/models/gpt_oss.py +1 -10
  138. sglang/srt/models/kimi_linear.py +678 -0
  139. sglang/srt/models/llama4.py +1 -1
  140. sglang/srt/models/llama_eagle3.py +11 -1
  141. sglang/srt/models/longcat_flash.py +2 -2
  142. sglang/srt/models/minimax_m2.py +922 -0
  143. sglang/srt/models/nvila.py +355 -0
  144. sglang/srt/models/nvila_lite.py +184 -0
  145. sglang/srt/models/qwen2.py +23 -2
  146. sglang/srt/models/qwen2_moe.py +30 -15
  147. sglang/srt/models/qwen3.py +35 -5
  148. sglang/srt/models/qwen3_moe.py +18 -12
  149. sglang/srt/models/qwen3_next.py +7 -0
  150. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  151. sglang/srt/multimodal/processors/base_processor.py +1 -0
  152. sglang/srt/multimodal/processors/glm4v.py +1 -1
  153. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  154. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  155. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  156. sglang/srt/multiplex/pdmux_context.py +164 -0
  157. sglang/srt/parser/conversation.py +7 -1
  158. sglang/srt/parser/reasoning_parser.py +28 -1
  159. sglang/srt/sampling/custom_logit_processor.py +67 -1
  160. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  161. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  162. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  163. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  164. sglang/srt/server_args.py +459 -199
  165. sglang/srt/single_batch_overlap.py +2 -4
  166. sglang/srt/speculative/draft_utils.py +16 -0
  167. sglang/srt/speculative/eagle_info.py +42 -36
  168. sglang/srt/speculative/eagle_info_v2.py +68 -25
  169. sglang/srt/speculative/eagle_utils.py +261 -16
  170. sglang/srt/speculative/eagle_worker.py +11 -3
  171. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  172. sglang/srt/speculative/spec_info.py +305 -31
  173. sglang/srt/speculative/spec_utils.py +44 -8
  174. sglang/srt/tracing/trace.py +121 -12
  175. sglang/srt/utils/common.py +142 -74
  176. sglang/srt/utils/hf_transformers_utils.py +38 -12
  177. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  178. sglang/test/kits/radix_cache_server_kit.py +50 -0
  179. sglang/test/runners.py +31 -7
  180. sglang/test/simple_eval_common.py +5 -3
  181. sglang/test/simple_eval_humaneval.py +1 -0
  182. sglang/test/simple_eval_math.py +1 -0
  183. sglang/test/simple_eval_mmlu.py +1 -0
  184. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  185. sglang/test/test_deterministic.py +235 -12
  186. sglang/test/test_deterministic_utils.py +2 -1
  187. sglang/test/test_utils.py +7 -1
  188. sglang/version.py +1 -1
  189. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
  190. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
  191. sglang/srt/models/vila.py +0 -306
  192. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  193. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  194. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  195. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/tracing/trace.py

@@ -15,6 +15,8 @@
 
 from __future__ import annotations
 
+import base64
+import json
 import logging
 import os
 import random
@@ -24,6 +26,8 @@ import uuid
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
+from sglang.srt.utils import get_int_env_var
+
 if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import Req
 
@@ -85,6 +89,8 @@ class SglangTraceReqContext:
     # Indicates whether this instance is a replica from the main process.
     # When True, root_span is None and only root_span_context is preserved.
     is_copy: bool = False
+    bootstrap_room_span: Optional[trace.span.Span] = None
+    bootstrap_room_span_context: Optional[context.Context] = None
     root_span: Optional[trace.span.Span] = None
     root_span_context: Optional[context.Context] = None
 
@@ -96,8 +102,7 @@ class SglangTracePropagateContext:
 
     def to_dict(self):
         carrier: dict[str, str] = {}
-        context.attach(self.root_span_context)
-        propagate.inject(carrier)
+        propagate.inject(carrier, self.root_span_context)
 
         if self.prev_span_context:
             return {
@@ -149,6 +154,7 @@ class SglangTraceCustomIdGenerator(id_generator.IdGenerator):
 
 
 # global variables
+remote_trace_contexts: Dict[str, SglangTracePropagateContext] = {}
 threads_info: Dict[int, SglangTraceThreadInfo] = {}
 reqs_context: Dict[str, SglangTraceReqContext] = {}
 
@@ -193,8 +199,17 @@ def process_tracing_init(otlp_endpoint, server_name):
         resource=resource, id_generator=SglangTraceCustomIdGenerator()
     )
 
+    schedule_delay_millis = get_int_env_var(
+        "SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS", 500
+    )
+    max_export_batch_size = get_int_env_var(
+        "SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE", 64
+    )
+
     processor = BatchSpanProcessor(
-        OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
+        OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True),
+        schedule_delay_millis=schedule_delay_millis,
+        max_export_batch_size=max_export_batch_size,
     )
     tracer_provider.add_span_processor(processor)
     trace.set_tracer_provider(tracer_provider)
@@ -266,7 +281,9 @@ def __create_thread_context(pid, req_span_context, ts: Optional[int] = None):
     return thread_context
 
 
-def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]:
+def trace_get_proc_propagate_context(
+    rid, remote_propagate=False
+) -> Optional[Dict[str, Any]]:
     if not tracing_enabled:
         return None
 
@@ -283,9 +300,11 @@ def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]:
     elif thread_context.last_span_context:
         prev_span_context = thread_context.last_span_context
 
-    trace_context = SglangTracePropagateContext(
-        reqs_context[rid].root_span_context, prev_span_context
-    )
+    root_span_context = reqs_context[rid].root_span_context
+    if remote_propagate:
+        root_span_context = reqs_context[rid].bootstrap_room_span_context
+
+    trace_context = SglangTracePropagateContext(root_span_context, prev_span_context)
    return trace_context.to_dict()
 
 
@@ -327,10 +346,54 @@ def trace_set_proc_propagate_context(rid, trace_context: Optional[Dict[str, Any]
     ].last_span_context = trace_context.prev_span_context
 
 
+def trace_get_remote_propagate_context(bootstrap_room_list: List[str]):
+    if not tracing_enabled:
+        return ""
+
+    reqs_trace_contexts = {}
+    for bootstrap_room in bootstrap_room_list:
+        # In the router, rid is also the bootstrap room.
+        bootstrap_room = str(bootstrap_room)
+
+        if bootstrap_room not in reqs_context:
+            continue
+
+        _context = trace_get_proc_propagate_context(
+            bootstrap_room, remote_propagate=True
+        )
+        reqs_trace_contexts[bootstrap_room] = _context
+
+    json_str = json.dumps(reqs_trace_contexts, ensure_ascii=False)
+    return base64.b64encode(json_str.encode("utf-8")).decode("utf-8")
+
+
+def trace_set_remote_propagate_context(base64_str):
+    if not tracing_enabled:
+        return
+
+    if base64_str is None or base64_str == "" or base64_str == "None":
+        return
+
+    base64_bytes = base64.b64decode(base64_str)
+    json_str = base64_bytes.decode("utf-8")
+    remote_reqs_trace_contexts = json.loads(json_str)
+
+    for bootstrap_room in remote_reqs_trace_contexts:
+        if bootstrap_room in remote_trace_contexts:
+            continue
+
+        remote_trace_contexts[bootstrap_room] = (
+            SglangTracePropagateContext.instance_from_dict(
+                remote_reqs_trace_contexts[bootstrap_room]
+            )
+        )
+
+
 def trace_req_start(
     rid: str,
     bootstrap_room: Optional[int] = None,
     ts: Optional[int] = None,
+    role: Optional[str] = "null",
 ):
     if not tracing_enabled:
         return
@@ -344,6 +407,7 @@ def trace_req_start(
         return
 
     # create req context and root span
+    bootstrap_room = 0 if bootstrap_room is None else bootstrap_room
     reqs_context[rid] = SglangTraceReqContext(
         rid=rid,
         start_time_ns=ts,
@@ -352,23 +416,42 @@
         is_copy=False,
     )
 
+    # create bootstrap room span
+    tracer = threads_info[pid].tracer
+    if str(bootstrap_room) not in remote_trace_contexts:
+        attrs = {"bootstrap_room": str(hex(bootstrap_room))}
+        bootstrap_room_span = tracer.start_span(
+            name=f"Bootstrap Room {hex(bootstrap_room)}",
+            start_time=ts,
+            attributes=attrs,
+        )
+        reqs_context[rid].bootstrap_room_span = bootstrap_room_span
+        bootstrap_room_span_context = trace.set_span_in_context(bootstrap_room_span)
+    else:
+        bootstrap_room_span_context = remote_trace_contexts[
+            str(bootstrap_room)
+        ].root_span_context
+
     # Drop the worker_id added by MultiTokenizer
     orig_rid = rid.split("_")[-1]
-    tracer = threads_info[pid].tracer
+    role = "" if role == "null" else role
+    attrs = {"rid": orig_rid}
     root_span = tracer.start_span(
-        name=f"Req {orig_rid[:8]}",
+        name=f"{role} Req {orig_rid[:8]}",
         start_time=ts,
+        context=bootstrap_room_span_context,
+        attributes=attrs,
     )
 
     root_span.set_attributes(
         {
             "rid": rid,
-            "bootstrap_room": bootstrap_room if bootstrap_room else "None",
         }
     )
 
     reqs_context[rid].root_span = root_span
     reqs_context[rid].root_span_context = trace.set_span_in_context(root_span)
+    reqs_context[rid].bootstrap_room_span_context = bootstrap_room_span_context
 
     # create thread context and thread span
     reqs_context[rid].threads_context[pid] = __create_thread_context(
@@ -376,6 +459,10 @@
         reqs_context[rid].root_span_context,
         ts,
     )
+    if str(bootstrap_room) in remote_trace_contexts:
+        reqs_context[rid].threads_context[pid].last_span_context = (
+            remote_trace_contexts[str(bootstrap_room)].prev_span_context
+        )
 
 
 def trace_req_finish(
@@ -399,6 +486,10 @@ def trace_req_finish(
         req_context.root_span.set_attributes(attrs)
 
     req_context.root_span.end(end_time=ts)
+    if str(req_context.bootstrap_room) in remote_trace_contexts:
+        del remote_trace_contexts[str(req_context.bootstrap_room)]
+    else:
+        req_context.bootstrap_room_span.end(end_time=ts)
 
     del reqs_context[rid]
 
@@ -518,7 +609,9 @@ trace_slice = trace_slice_end
 
 
 # Add event to the current slice on the same thread with the same rid.
-def trace_event(name: str, rid: str, ts: Optional[int] = None):
+def trace_event(
+    name: str, rid: str, ts: Optional[int] = None, attrs: Dict[str, Any] = None
+):
     if not tracing_enabled:
         return
 
@@ -539,7 +632,7 @@ def trace_event(name: str, rid: str, ts: Optional[int] = None):
     ts = ts or __get_cur_time_ns()
 
     slice_info = thread_context.cur_slice_stack[-1]
-    slice_info.span.add_event(name=name, timestamp=ts)
+    slice_info.span.add_event(name=name, timestamp=ts, attributes=attrs)
 
 
 # Add attrs to the current slice on the same thread with the same rid.
@@ -569,6 +662,9 @@ def trace_slice_batch(
     name: str,
     reqs: List[Req],
 ):
+    if not tracing_enabled:
+        return
+
     for req in reqs:
         trace_slice(
             name,
@@ -576,3 +672,16 @@
             auto_next_anon=not req.finished(),
             thread_finish_flag=req.finished(),
         )
+
+
+def trace_event_batch(
+    name: str,
+    reqs: List[Req],
+    ts: Optional[int] = None,
+    attrs: Dict[str, Any] = None,
+):
+    if not tracing_enabled:
+        return
+
+    for req in reqs:
+        trace_event(name, req.rid, ts=ts, attrs=attrs)
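Note: the tracing changes above make the OTLP exporter's batching tunable through two new environment variables (defaults: 500 ms flush interval, 64 spans per batch). A minimal standalone sketch of the same wiring, assuming the stock opentelemetry-python SDK (the endpoint value below is illustrative):

import os
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# Same defaults as the diff: flush every 500 ms, at most 64 spans per export.
delay_ms = int(os.environ.get("SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS", "500"))
batch_size = int(os.environ.get("SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE", "64"))

provider = TracerProvider()
provider.add_span_processor(
    BatchSpanProcessor(
        OTLPSpanExporter(endpoint="localhost:4317", insecure=True),
        schedule_delay_millis=delay_ms,
        max_export_batch_size=batch_size,
    )
)
trace.set_tracer_provider(provider)

Lowering the delay trades export overhead for fresher spans; raising the batch size amortizes RPC cost under heavy request tracing.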
sglang/srt/utils/common.py

@@ -56,7 +56,6 @@ from json import JSONDecodeError
 from multiprocessing.reduction import ForkingPickler
 from pathlib import Path
 from typing import (
-    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -94,9 +93,6 @@ from typing_extensions import Literal
 from sglang.srt.environ import envs
 from sglang.srt.metrics.func_timer import enable_func_timer
 
-if TYPE_CHECKING:
-    from sglang.srt.layers.quantization.base_config import QuantizeMethodBase
-
 logger = logging.getLogger(__name__)
 
 show_time_cost = False
@@ -138,6 +134,7 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
+@lru_cache(maxsize=1)
 def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()
 
@@ -191,7 +188,16 @@ is_hopper_with_cuda_12_3 = lambda: _check(9)
 def is_blackwell():
     if not is_cuda():
         return False
-    return torch.cuda.get_device_capability()[0] == 10
+    return torch.cuda.get_device_capability()[0] in [10, 12]
+
+
+@lru_cache(maxsize=1)
+def is_blackwell_supported(device=None) -> bool:
+    if not is_cuda_alike():
+        return False
+    return (torch.cuda.get_device_capability(device)[0] in [10, 12]) and (
+        torch.version.cuda >= "12.8"
+    )
 
 
 @lru_cache(maxsize=1)
@@ -1069,32 +1075,6 @@ def monkey_patch_p2p_access_check():
     setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
 
 
-def monkey_patch_vllm_gguf_config():
-    try:
-        from vllm.model_executor.layers.quantization.gguf import (
-            GGUFConfig,
-            GGUFEmbeddingMethod,
-            GGUFLinearMethod,
-        )
-    except ImportError:
-        return
-
-    from sglang.srt.layers.linear import LinearBase
-    from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
-
-    def get_quant_method_with_embedding_replaced(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[QuantizeMethodBase]:
-        if isinstance(layer, LinearBase):
-            return GGUFLinearMethod(self)
-        elif isinstance(layer, VocabParallelEmbedding):
-            # patch to own VocabParallelEmbedding
-            return GGUFEmbeddingMethod(self)
-        return None
-
-    setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
-
-
 def set_ulimit(target_soft_limit=65535):
     # number of open files
     resource_type = resource.RLIMIT_NOFILE
@@ -1131,9 +1111,9 @@ def add_api_key_middleware(app, api_key: str):
     async def authentication(request, call_next):
         if request.method == "OPTIONS":
             return await call_next(request)
-        if request.url.path.startswith("/health"):
-            return await call_next(request)
-        if request.url.path.startswith("/metrics"):
+        if request.url.path.startswith("/health") or request.url.path.startswith(
+            "/metrics"
+        ):
             return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
@@ -1259,42 +1239,34 @@ def point_to_point_pyobj(
     dst: int = 1,
 ):
     """Send data from src to dst in group using DeviceToDevice communication."""
-
+    device = torch.get_device_module().current_device()
     if rank == src:
         if len(data) == 0:
-            tensor_size = torch.tensor(
-                [0], dtype=torch.long, device=torch.cuda.current_device()
-            )
+            tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.send(tensor_size, dst=dst, group=group)
         else:
             serialized_data = pickle.dumps(data)
             size = len(serialized_data)
             tensor_data = torch.ByteTensor(
                 np.frombuffer(serialized_data, dtype=np.uint8)
-            ).cuda(
-                device=torch.cuda.current_device()
+            ).to(
+                device=device
             )  # Move to GPU
-            tensor_size = torch.tensor(
-                [size], dtype=torch.long, device=torch.cuda.current_device()
-            )
+            tensor_size = torch.tensor([size], dtype=torch.long, device=device)
 
             dist.send(tensor_size, dst=dst, group=group)
             dist.send(tensor_data, dst=dst, group=group)
         return data
 
     elif rank == dst:
-        tensor_size = torch.tensor(
-            [0], dtype=torch.long, device=torch.cuda.current_device()
-        )
+        tensor_size = torch.tensor([0], dtype=torch.long, device=device)
         dist.recv(tensor_size, src=src, group=group)
         size = tensor_size.item()
 
         if size == 0:
             return []
 
-        tensor_data = torch.empty(
-            size, dtype=torch.uint8, device=torch.cuda.current_device()
-        )
+        tensor_data = torch.empty(size, dtype=torch.uint8, device=device)
         dist.recv(tensor_data, src=src, group=group)
 
         serialized_data = bytes(
@@ -2106,7 +2078,7 @@ class MultiprocessingSerializer:
 
         if output_str:
             # Convert bytes to base64-encoded string
-            pybase64.b64encode(output).decode("utf-8")
+            output = pybase64.b64encode(output).decode("utf-8")
 
         return output
 
@@ -2125,7 +2097,78 @@
        # Decode base64 string to bytes
        data = pybase64.b64decode(data, validate=True)
 
-        return ForkingPickler.loads(data)
+        return SafeUnpickler(io.BytesIO(data)).load()
+
+
+class SafeUnpickler(pickle.Unpickler):
+    ALLOWED_MODULE_PREFIXES = {
+        # --- Python types ---
+        "builtins.",
+        "collections.",
+        "copyreg.",
+        "functools.",
+        "itertools.",
+        "operator.",
+        "types.",
+        "weakref.",
+        # --- PyTorch types ---
+        "torch.",
+        "torch._tensor.",
+        "torch.storage.",
+        "torch.nn.parameter.",
+        "torch.autograd.function.",
+        # --- torch distributed ---
+        "torch.distributed.",
+        "torch.distributed._shard.",
+        "torch.distributed._composable.",
+        "torch._C._distributed_c10d.",
+        "torch._C._distributed_fsdp.",
+        "torch.distributed.optim.",
+        # --- multiprocessing ---
+        "multiprocessing.resource_sharer.",
+        "multiprocessing.reduction.",
+        "pickletools.",
+        # --- PEFT / LoRA ---
+        "peft.",
+        "transformers.",
+        "huggingface_hub.",
+        # --- SGLang & Unitest ---
+        "sglang.srt.weight_sync.tensor_bucket.",
+        "sglang.srt.model_executor.model_runner.",
+        "sglang.srt.layers.",
+        "sglang.srt.utils.",
+    }
+
+    DENY_CLASSES = {
+        ("builtins", "eval"),
+        ("builtins", "exec"),
+        ("builtins", "compile"),
+        ("os", "system"),
+        ("subprocess", "Popen"),
+        ("subprocess", "run"),
+        ("codecs", "decode"),
+        ("types", "CodeType"),
+        ("types", "FunctionType"),
+    }
+
+    def find_class(self, module, name):
+        # Block deterministic attacks
+        if (module, name) in self.DENY_CLASSES:
+            raise RuntimeError(
+                f"Blocked unsafe class loading ({module}.{name}), "
+                f"to prevent exploitation of CVE-2025-10164"
+            )
+        # Allowlist of safe-to-load modules.
+        if any(
+            (module + ".").startswith(prefix) for prefix in self.ALLOWED_MODULE_PREFIXES
+        ):
+            return super().find_class(module, name)
+
+        # Block everything else. (Potential attack surface)
+        raise RuntimeError(
+            f"Blocked unsafe class loading ({module}.{name}), "
+            f"to prevent exploitation of CVE-2025-10164"
+        )
 
 
 def debug_timing(func):
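Note: SafeUnpickler above replaces ForkingPickler.loads in MultiprocessingSerializer.deserialize to close CVE-2025-10164. find_class is the single hook through which pickle resolves global references, so a deny/allow list there blocks gadget payloads before any object is constructed. A toy demonstration of the pattern (the Evil class is hypothetical, not part of sglang):

import io
import pickle

class Evil:
    def __reduce__(self):
        import os
        return (os.system, ("echo pwned",))  # classic pickle RCE gadget

class DemoUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if (module, name) == ("os", "system"):
            raise RuntimeError(f"Blocked unsafe class loading ({module}.{name})")
        return super().find_class(module, name)

payload = pickle.dumps(Evil())
# pickle.loads(payload) would run the shell command; the restricted
# unpickler raises before os.system is ever resolved.
try:
    DemoUnpickler(io.BytesIO(payload)).load()
except RuntimeError as e:
    print(e)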
@@ -2308,16 +2351,24 @@ def launch_dummy_health_check_server(host, port, enable_metrics):
     )
     server = uvicorn.Server(config=config)
 
-    try:
-        loop = asyncio.get_running_loop()
-        logger.info(
-            f"Dummy health check server scheduled on existing loop at {host}:{port}"
-        )
-        loop.create_task(server.serve())
+    # Run server in a background daemon thread with its own event loop
+    # This prevents blocking the main thread while still serving health checks
+    def run_server():
+        try:
+            asyncio.run(server.serve())
+        except Exception as e:
+            logger.error(f"Dummy health check server failed to start: {e}")
+            raise
+        finally:
+            logger.info(f"Dummy health check server stopped at {host}:{port}")
 
-    except RuntimeError:
-        logger.info(f"Starting dummy health check server at {host}:{port}")
-        server.run()
+    thread = threading.Thread(
+        target=run_server, daemon=True, name="health-check-server"
+    )
+    thread.start()
+    logger.info(
+        f"Dummy health check server started in background thread at {host}:{port}"
+    )
 
 
 def create_checksum(directory: str):
@@ -2578,17 +2629,12 @@ def get_local_ip_auto(fallback: str = None) -> str:
     raise ValueError("Can not get local ip")
 
 
-def is_page_size_one(server_args):
-    return server_args.page_size == 1
-
-
 # TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
 # TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
-        server_args.speculative_eagle_topk is not None
-        and server_args.speculative_eagle_topk == 1
-        and is_page_size_one(server_args)
+        server_args.speculative_eagle_topk == 1
+        and (server_args.page_size == 1 or server_args.page_size is None)
     )
 
 
@@ -3068,12 +3114,16 @@ def apply_module_patch(target_module, target_function, wrappers):
     setattr(original_module, target_function, candidate)
 
     for key, value in sys.modules.copy().items():
-        if (
-            target_function is not None
-            and hasattr(value, target_function)
-            and id(getattr(value, target_function)) == original_function_id
-        ):
-            setattr(value, target_function, candidate)
+        try:
+            if (
+                target_function is not None
+                and hasattr(value, target_function)
+                and id(getattr(value, target_function)) == original_function_id
+            ):
+                setattr(value, target_function, candidate)
+        except ImportError as e:
+            # Ignore some modules reporting ImportError when calling hasattr
+            logger.warning(f"Ignore {value} reports ImportError with:\n{str(e)}")
 
 
 def parse_module_path(module_path, function_name, create_dummy):
@@ -3525,6 +3575,24 @@ def cached_triton_kernel(key_fn=None):
     """
 
     def decorator(fn):
-        return CachedKernel(fn, key_fn)
+        if envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.get():
+            logger.debug(
+                f"{envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.name} = True. Using custom triton kernel cache."
+            )
+            return CachedKernel(fn, key_fn)
+        else:
+            # Fallback to the native triton cache.
+            logger.debug(
+                f"{envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.name} = False. Using native triton kernel cache."
+            )
+            return fn
 
     return decorator
+
+
+# Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+def calc_diff(x, y):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
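Note: calc_diff, copied from DeepGEMM, reduces to sum((x - y)^2) / (sum(x^2) + sum(y^2)), since sum((x - y)^2) = sum(x^2) - 2*sum(x*y) + sum(y^2); it is 0 for identical tensors and grows with relative error. A quick sanity check, assuming PyTorch:

import torch

def calc_diff(x, y):  # as added above in sglang/srt/utils/common.py
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    sim = 2 * (x * y).sum() / denominator
    return 1 - sim

x = torch.ones(4)
print(float(calc_diff(x, x)))         # 0.0 for identical tensors
print(float(calc_diff(x, 1.01 * x)))  # ~4.95e-05, i.e. 0.0004 / 8.0804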
sglang/srt/utils/hf_transformers_utils.py

@@ -43,6 +43,7 @@ from sglang.srt.configs import (
     DotsVLMConfig,
     ExaoneConfig,
     FalconH1Config,
+    KimiLinearConfig,
     KimiVLConfig,
     LongcatFlashConfig,
     MultiModalityConfig,
@@ -54,6 +55,7 @@ from sglang.srt.configs import (
 from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
 from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
+from sglang.srt.multimodal.customized_mm_processor_utils import _CUSTOMIZED_MM_PROCESSOR
 from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
 
 _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
@@ -67,6 +69,7 @@ _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
     Step3VLConfig,
     LongcatFlashConfig,
     Olmo3Config,
+    KimiLinearConfig,
     Qwen3NextConfig,
     FalconH1Config,
     DotsVLMConfig,
@@ -172,6 +175,16 @@ def _load_deepseek_v32_model(
     )
 
 
+def _is_deepseek_ocr_model(config: PretrainedConfig) -> bool:
+    # TODO: Remove this workaround related when AutoConfig correctly identifies deepseek-ocr.
+    # Hugging Face's AutoConfig currently misidentifies it as deepseekvl2.
+    return (
+        getattr(config, "auto_map", None) is not None
+        and config.auto_map.get("AutoModel")
+        == "modeling_deepseekocr.DeepseekOCRForCausalLM"
+    )
+
+
 @lru_cache_frozenset(maxsize=32)
 def get_config(
     model: str,
@@ -197,10 +210,6 @@ def get_config(
         config = AutoConfig.from_pretrained(
             model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
         )
-        if "deepseek-ai/DeepSeek-OCR" in model:
-            config.model_type = "deepseek-ocr"
-            # Due to an unknown reason, Hugging Face’s AutoConfig mistakenly recognizes the configuration of deepseek-ocr as deepseekvl2.
-            # This is a temporary workaround and will require further optimization.
 
     except ValueError as e:
         if not "deepseek_v32" in str(e):
@@ -237,7 +246,11 @@ def get_config(
             setattr(config, key, val)
 
     if config.model_type in _CONFIG_REGISTRY:
-        config_class = _CONFIG_REGISTRY[config.model_type]
+        model_type = config.model_type
+        if model_type == "deepseek_vl_v2":
+            if _is_deepseek_ocr_model(config):
+                model_type = "deepseek-ocr"
+        config_class = _CONFIG_REGISTRY[model_type]
         config = config_class.from_pretrained(model, revision=revision)
         # NOTE(HandH1998): Qwen2VL requires `_name_or_path` attribute in `config`.
         setattr(config, "_name_or_path", model)
@@ -441,6 +454,10 @@ def get_processor(
         **kwargs,
     )
 
+    if _is_deepseek_ocr_model(config):
+        # Temporary hack for load deepseek-ocr
+        config.model_type = "deepseek-ocr"
+
     # fix: for Qwen2-VL and Sarashina2Vision models, inject default 'size' if not provided.
     if config.model_type in {"qwen2_vl", "sarashina2_vision"}:
         if "size" not in kwargs:
@@ -458,13 +475,22 @@ def get_processor(
                 **kwargs,
             )
         else:
-            processor = AutoProcessor.from_pretrained(
-                tokenizer_name,
-                *args,
-                trust_remote_code=trust_remote_code,
-                revision=revision,
-                **kwargs,
-            )
+            if config.model_type in _CUSTOMIZED_MM_PROCESSOR:
+                processor = _CUSTOMIZED_MM_PROCESSOR[config.model_type].from_pretrained(
+                    tokenizer_name,
+                    *args,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    **kwargs,
+                )
+            else:
+                processor = AutoProcessor.from_pretrained(
+                    tokenizer_name,
+                    *args,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    **kwargs,
+                )
 
     except ValueError as e:
         error_message = str(e)
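Note: the get_processor change dispatches through _CUSTOMIZED_MM_PROCESSOR (from sglang/srt/multimodal/customized_mm_processor_utils.py, file 150 in the list above) before falling back to AutoProcessor. The underlying shape is a plain model_type-to-class registry; a sketch of that pattern under that assumption (the decorator name and demo class below are illustrative, not sglang's actual registration API):

from typing import Dict, Type

_CUSTOMIZED_MM_PROCESSOR: Dict[str, Type] = {}

def register_customized_processor(model_type: str):
    # Map a Hugging Face model_type to a custom processor class.
    def wrap(processor_cls: Type) -> Type:
        _CUSTOMIZED_MM_PROCESSOR[model_type] = processor_cls
        return processor_cls
    return wrap

@register_customized_processor("demo_vlm")
class DemoProcessor:
    @classmethod
    def from_pretrained(cls, name, *args, **kwargs):
        return cls()

# Dispatch mirrors the diff: prefer the registry, else fall back to AutoProcessor.
model_type = "demo_vlm"
if model_type in _CUSTOMIZED_MM_PROCESSOR:
    processor = _CUSTOMIZED_MM_PROCESSOR[model_type].from_pretrained("dummy")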