sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (107)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +79 -53
  3. sglang/bench_serving.py +186 -14
  4. sglang/profiler.py +0 -1
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/longcat_flash.py +104 -0
  7. sglang/srt/configs/model_config.py +12 -0
  8. sglang/srt/connector/__init__.py +1 -1
  9. sglang/srt/connector/base_connector.py +1 -2
  10. sglang/srt/connector/redis.py +2 -2
  11. sglang/srt/connector/serde/__init__.py +1 -1
  12. sglang/srt/connector/serde/safe_serde.py +4 -3
  13. sglang/srt/conversation.py +38 -5
  14. sglang/srt/disaggregation/ascend/conn.py +75 -0
  15. sglang/srt/disaggregation/launch_lb.py +0 -13
  16. sglang/srt/disaggregation/mini_lb.py +33 -8
  17. sglang/srt/disaggregation/prefill.py +1 -1
  18. sglang/srt/distributed/parallel_state.py +24 -14
  19. sglang/srt/entrypoints/engine.py +19 -12
  20. sglang/srt/entrypoints/http_server.py +174 -34
  21. sglang/srt/entrypoints/openai/protocol.py +87 -24
  22. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  23. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  24. sglang/srt/eplb/eplb_manager.py +26 -2
  25. sglang/srt/eplb/expert_distribution.py +29 -2
  26. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  27. sglang/srt/function_call/function_call_parser.py +2 -0
  28. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  29. sglang/srt/harmony_parser.py +588 -0
  30. sglang/srt/hf_transformers_utils.py +26 -7
  31. sglang/srt/layers/activation.py +12 -0
  32. sglang/srt/layers/attention/ascend_backend.py +374 -136
  33. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  34. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  35. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  36. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  38. sglang/srt/layers/communicator.py +1 -2
  39. sglang/srt/layers/layernorm.py +28 -3
  40. sglang/srt/layers/linear.py +3 -2
  41. sglang/srt/layers/logits_processor.py +1 -1
  42. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  43. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  44. sglang/srt/layers/moe/ep_moe/layer.py +13 -13
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/topk.py +35 -12
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  49. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  50. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  51. sglang/srt/layers/quantization/fp8.py +2 -1
  52. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  53. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  54. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  55. sglang/srt/layers/quantization/mxfp4.py +25 -27
  56. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  57. sglang/srt/layers/quantization/utils.py +13 -0
  58. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  59. sglang/srt/layers/rotary_embedding.py +28 -1
  60. sglang/srt/layers/sampler.py +29 -5
  61. sglang/srt/layers/utils.py +0 -14
  62. sglang/srt/managers/cache_controller.py +237 -204
  63. sglang/srt/managers/detokenizer_manager.py +48 -2
  64. sglang/srt/managers/io_struct.py +57 -0
  65. sglang/srt/managers/mm_utils.py +5 -1
  66. sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
  67. sglang/srt/managers/scheduler.py +94 -9
  68. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  69. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  70. sglang/srt/managers/tokenizer_manager.py +122 -42
  71. sglang/srt/mem_cache/chunk_cache.py +1 -1
  72. sglang/srt/mem_cache/hicache_storage.py +51 -23
  73. sglang/srt/mem_cache/hiradix_cache.py +87 -71
  74. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  75. sglang/srt/mem_cache/memory_pool.py +77 -14
  76. sglang/srt/mem_cache/memory_pool_host.py +4 -5
  77. sglang/srt/mem_cache/radix_cache.py +6 -4
  78. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  79. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
  80. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
  81. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  82. sglang/srt/model_executor/model_runner.py +6 -5
  83. sglang/srt/model_loader/loader.py +15 -24
  84. sglang/srt/model_loader/utils.py +12 -0
  85. sglang/srt/models/deepseek_v2.py +38 -13
  86. sglang/srt/models/gpt_oss.py +2 -15
  87. sglang/srt/models/llama_eagle3.py +4 -0
  88. sglang/srt/models/longcat_flash.py +1015 -0
  89. sglang/srt/models/longcat_flash_nextn.py +691 -0
  90. sglang/srt/models/qwen2.py +26 -3
  91. sglang/srt/models/qwen2_5_vl.py +66 -41
  92. sglang/srt/models/qwen2_moe.py +22 -2
  93. sglang/srt/models/transformers.py +1 -1
  94. sglang/srt/multimodal/processors/base_processor.py +4 -2
  95. sglang/srt/reasoning_parser.py +56 -300
  96. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  97. sglang/srt/server_args.py +122 -56
  98. sglang/srt/speculative/eagle_worker.py +28 -8
  99. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  100. sglang/srt/utils.py +73 -5
  101. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  102. sglang/version.py +1 -1
  103. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
  104. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
  105. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
  106. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/managers/detokenizer_manager.py

@@ -32,11 +32,14 @@ from sglang.srt.managers.io_struct import (
     BatchStrOut,
     BatchTokenIDOut,
     FreezeGCReq,
+    MultiTokenizerRegisterReq,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerMixin
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
     freeze_gc,
+    get_worker_ids_from_req_rids,
     get_zmq_socket,
     kill_itself_when_parent_died,
 )
@@ -67,7 +70,7 @@ class DecodeStatus:
     sent_offset: int = 0


-class DetokenizerManager:
+class DetokenizerManager(MultiTokenizerMixin):
     """DetokenizerManager is a process that detokenizes the token ids."""

     def __init__(
@@ -102,10 +105,13 @@ class DetokenizerManager:
                 (BatchEmbeddingOut, self.handle_batch_embedding_out),
                 (BatchTokenIDOut, self.handle_batch_token_id_out),
                 (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+                (MultiTokenizerRegisterReq, lambda x: x),
                 (FreezeGCReq, self.handle_freeze_gc_req),
             ]
         )

+        self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
+
     def event_loop(self):
         """The event loop that handles requests"""
         while True:
@@ -114,6 +120,39 @@
             if output is not None:
                 self.send_to_tokenizer.send_pyobj(output)

+    def multi_tokenizer_manager_event_loop(self):
+        """The event loop that handles requests, for multi tokenizer manager mode only"""
+        self.create_sockets_mapping()
+        while True:
+            recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            if output is None:
+                continue
+            # Extract worker_id from rid
+            if isinstance(recv_obj.rids, list):
+                worker_ids = get_worker_ids_from_req_rids(recv_obj.rids)
+            else:
+                raise RuntimeError(
+                    f"for tokenizer_worker_num > 1, recv_obj.rids must be a list"
+                )
+
+            # Send data using the corresponding socket
+            for i, worker_id in enumerate(worker_ids):
+                if isinstance(recv_obj, MultiTokenizerRegisterReq):
+                    if self.register_tokenizer_ipc(recv_obj, worker_id):
+                        logger.info(
+                            f"DetokenizerManager Created ZMQ socket for worker {worker_id}"
+                        )
+                    continue
+                else:
+                    if worker_id not in self.tokenizer_mapping:
+                        logger.error(
+                            f"Tokenizer Worker ID {worker_id} not registered. Check if the server Process {worker_id} is alive"
+                        )
+                        continue
+                    new_output = self._handle_output_by_index(output, i)
+                    self.tokenizer_mapping[worker_id].send_pyobj(new_output)
+
     def trim_matched_stop(
         self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
     ):
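
The routing above hinges on get_worker_ids_from_req_rids, newly imported from sglang.srt.utils. Its body is not part of this excerpt; a minimal sketch, assuming each request id is prefixed with the issuing tokenizer worker's id and an underscore (e.g. "3_<uuid>"), could look like:

    from typing import List

    def get_worker_ids_from_req_rids(rids: List[str]) -> List[int]:
        # Assumed rid format: "<worker_id>_<request_uuid>"; the prefix before
        # the first underscore names the tokenizer worker that issued the rid.
        return [int(rid.split("_", 1)[0]) for rid in rids]

Under that assumption, each batched output is fanned out entry by entry: _handle_output_by_index(output, i) extracts the i-th request's slice of the batch, and the per-worker ZMQ socket in self.tokenizer_mapping carries it back to the tokenizer process that owns the rid.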
@@ -133,6 +172,9 @@

         # Trim stop token.
         if isinstance(matched, int) and isinstance(output, list):
+            # 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
+            if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
+                return output
             assert len(output) > 0
             return output[:-1]
         return output
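
This guard keeps gpt-oss tool calls parseable: token 200012 (<|call|>) is both an EOS token and the tool-call terminator, so trimming it would leave the downstream tool-call detector without its closing token. Illustrative calls (the other token ids are made up, and finished_reason stands in for a dict whose "matched" entry is an integer stop id):

    # tool_call_parser == "gpt-oss" and the output ends in <|call|>:
    manager.trim_matched_stop([1000, 2000, 200012], finished_reason, False)
    # -> [1000, 2000, 200012]  (kept intact for the tool-call parser)

    # Any other matched integer stop token is trimmed as before:
    manager.trim_matched_stop([1000, 2000, 199999], finished_reason, False)
    # -> [1000, 2000]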
@@ -280,8 +322,12 @@ def run_detokenizer_process(

     try:
         manager = DetokenizerManager(server_args, port_args)
-        manager.event_loop()
+        if server_args.tokenizer_worker_num > 1:
+            manager.multi_tokenizer_manager_event_loop()
+        else:
+            manager.event_loop()
     except Exception:
+        manager.clear_tokenizer_mapping()
         traceback = get_exception_traceback()
         logger.error(f"DetokenizerManager hit an exception: {traceback}")
         parent_process.send_signal(signal.SIGQUIT)
sglang/srt/managers/io_struct.py

@@ -533,6 +533,21 @@ class TokenizedGenerateReqInput:
     dp_balance_id: int = -1


+@dataclass
+class BatchTokenizedGenerateReqInput:
+    # The batch of tokenized requests
+    batch: List[TokenizedGenerateReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
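
__len__, __getitem__, and __iter__ give the wrapper the sequence protocol, so consumers can treat a pre-tokenized batch exactly like a list of requests. A usage sketch (handle is a hypothetical per-request callback):

    reqs: List[TokenizedGenerateReqInput] = ...  # built by the tokenizer workers
    batch_req = BatchTokenizedGenerateReqInput(batch=reqs)

    assert len(batch_req) == len(reqs)
    first = batch_req[0]          # indexes like a list
    for req in batch_req:         # iterates like a list
        handle(req)

BatchTokenizedEmbeddingReqInput in the next hunk follows the same pattern for embedding requests.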
@@ -668,6 +683,21 @@ class TokenizedEmbeddingReqInput:
     dp_balance_id: int = -1


+@dataclass
+class BatchTokenizedEmbeddingReqInput:
+    # The batch of tokenized embedding requests
+    batch: List[TokenizedEmbeddingReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
 @dataclass
 class BatchTokenIDOut:
     # The request id
@@ -784,6 +814,16 @@ class BatchEmbeddingOut:
     cached_tokens: List[int]


+@dataclass
+class ClearHiCacheReqInput:
+    pass
+
+
+@dataclass
+class ClearHiCacheReqOutput:
+    success: bool
+
+
 @dataclass
 class FlushCacheReqInput:
     pass
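
The ClearHiCacheReqInput/ClearHiCacheReqOutput pair parallels the existing FlushCacheReqInput/FlushCacheReqOutput just below, but targets the hierarchical (HiCache) host cache rather than the device radix cache. A sketch of the round trip, where send_to_scheduler and await_response are illustrative stand-ins for the actual socket plumbing (the concrete wiring lives in the tokenizer_manager.py and http_server.py changes not excerpted here):

    # Hypothetical round trip; the names below are not the real API.
    send_to_scheduler.send_pyobj(ClearHiCacheReqInput())
    resp: ClearHiCacheReqOutput = await_response()
    if not resp.success:
        logger.warning("Clearing the hierarchical cache failed")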
@@ -943,6 +983,11 @@ class AbortReq:
     abort_all: bool = False
     # The finished reason data
     finished_reason: Optional[Dict[str, Any]] = None
+    # used in MultiTokenizerManager mode
+    rids: Optional[Union[List[str], str]] = None
+
+    def __post_init__(self):
+        self.rids = self.rid


 @dataclass
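
__post_init__ mirrors the request's single rid into the new rids field, so an AbortReq can flow through the same worker-id routing as batched outputs without special-casing. For example (the worker-prefixed rid format is an assumption, as above):

    req = AbortReq(rid="3_e4b1c2", abort_all=False)
    req.rids  # -> "3_e4b1c2", copied automatically by __post_init__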
@@ -1143,6 +1188,18 @@ class LoRAUpdateResult:
 LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult


+@dataclass
+class MultiTokenizerRegisterReq:
+    rids: Optional[Union[List[str], str]] = None
+    ipc_name: Optional[str] = None
+
+
+@dataclass
+class MultiTokenizerWarpper:
+    worker_id: int
+    obj: Optional[Any] = None
+
+
 class BlockReqType(Enum):
     BLOCK = 1
     UNBLOCK = 2
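
MultiTokenizerRegisterReq is the handshake message each tokenizer worker sends to announce its IPC endpoint; register_tokenizer_ipc (from MultiTokenizerMixin, whose body is not in this excerpt) presumably opens a per-worker socket keyed by worker id. A sketch under that assumption, reusing get_zmq_socket as imported in the first hunk (its bind-flag argument is also assumed):

    import zmq

    def register_tokenizer_ipc(self, req: MultiTokenizerRegisterReq, worker_id: int) -> bool:
        # Sketch only: connect a PUSH socket to the worker's announced IPC name
        # the first time this worker id is seen; report whether one was created.
        if worker_id in self.tokenizer_mapping or req.ipc_name is None:
            return False
        context = zmq.Context.instance()
        self.tokenizer_mapping[worker_id] = get_zmq_socket(
            context, zmq.PUSH, req.ipc_name, False  # bind=False: connect to the worker
        )
        return True

MultiTokenizerWarpper (the spelling is the source's own) pairs a payload with the worker id that owns it, letting intermediaries tag objects for the return trip.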
sglang/srt/managers/mm_utils.py

@@ -20,9 +20,11 @@ from sglang.srt.managers.schedule_batch import (
 )
 from sglang.srt.mem_cache.multimodal_cache import MultiModalCache
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import flatten_nested_list, print_warning_once
+from sglang.srt.utils import flatten_nested_list, is_npu, print_warning_once
 from sglang.utils import logger

+_is_npu = is_npu()
+
 # NOTE: Using the shared logger from sglang.utils instead of creating a module-specific logger
 # to ensure consistent logging behavior across the codebase. This prevents issues with log
 # propagation that can cause some log messages (like 'server is fired up') to not appear
@@ -486,6 +488,8 @@ def get_embedding_and_mask(
     if embedding is None:
         return None, None
     # 2. Get mask
+    if _is_npu:
+        torch.npu.current_stream().synchronize()
     special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
     # 3. Adjust embedding length if needed
     embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
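
Evaluating is_npu() once at module import keeps the per-call overhead of the new guard to a single boolean test. The synchronize itself is a conservative barrier: on Ascend devices the embedding produced in step 1 may still be in flight on the current stream, and this likely prevents the mask computation in step 2 from racing with it. The pattern in isolation (requires the torch_npu extension, which provides torch.npu):

    import torch

    _is_npu = is_npu()  # evaluated once at import, not per call

    def sync_if_npu():
        # Hypothetical helper showing the same guard the diff inlines:
        # wait for outstanding NPU work before touching device tensors.
        if _is_npu:
            torch.npu.current_stream().synchronize()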