sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +2 -2
  2. sglang/srt/configs/model_config.py +36 -2
  3. sglang/srt/conversation.py +56 -3
  4. sglang/srt/disaggregation/ascend/__init__.py +6 -0
  5. sglang/srt/disaggregation/ascend/conn.py +44 -0
  6. sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +50 -18
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  9. sglang/srt/disaggregation/utils.py +25 -3
  10. sglang/srt/entrypoints/engine.py +1 -1
  11. sglang/srt/entrypoints/http_server.py +1 -0
  12. sglang/srt/entrypoints/http_server_engine.py +1 -1
  13. sglang/srt/entrypoints/openai/protocol.py +11 -0
  14. sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  15. sglang/srt/function_call/function_call_parser.py +2 -0
  16. sglang/srt/function_call/kimik2_detector.py +220 -0
  17. sglang/srt/hf_transformers_utils.py +18 -0
  18. sglang/srt/jinja_template_utils.py +8 -0
  19. sglang/srt/layers/communicator.py +20 -5
  20. sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
  21. sglang/srt/layers/layernorm.py +2 -2
  22. sglang/srt/layers/linear.py +12 -2
  23. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  24. sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
  25. sglang/srt/layers/moe/ep_moe/layer.py +141 -2
  26. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  29. sglang/srt/layers/moe/topk.py +8 -2
  30. sglang/srt/layers/parameter.py +19 -3
  31. sglang/srt/layers/quantization/__init__.py +2 -0
  32. sglang/srt/layers/quantization/fp8.py +28 -7
  33. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  34. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  35. sglang/srt/layers/quantization/moe_wna16.py +1 -2
  36. sglang/srt/layers/quantization/w4afp8.py +264 -0
  37. sglang/srt/layers/quantization/w8a8_int8.py +738 -14
  38. sglang/srt/layers/vocab_parallel_embedding.py +9 -3
  39. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  40. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  41. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  42. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  43. sglang/srt/managers/cache_controller.py +41 -195
  44. sglang/srt/managers/io_struct.py +35 -3
  45. sglang/srt/managers/mm_utils.py +59 -96
  46. sglang/srt/managers/schedule_batch.py +17 -6
  47. sglang/srt/managers/scheduler.py +38 -6
  48. sglang/srt/managers/tokenizer_manager.py +16 -0
  49. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  50. sglang/srt/mem_cache/memory_pool.py +176 -101
  51. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  52. sglang/srt/mem_cache/radix_cache.py +8 -4
  53. sglang/srt/model_executor/forward_batch_info.py +13 -1
  54. sglang/srt/model_loader/loader.py +23 -12
  55. sglang/srt/models/deepseek_janus_pro.py +1 -1
  56. sglang/srt/models/deepseek_v2.py +78 -19
  57. sglang/srt/models/deepseek_vl2.py +1 -1
  58. sglang/srt/models/gemma3_mm.py +1 -1
  59. sglang/srt/models/gemma3n_mm.py +6 -3
  60. sglang/srt/models/internvl.py +8 -2
  61. sglang/srt/models/kimi_vl.py +8 -2
  62. sglang/srt/models/llama.py +2 -0
  63. sglang/srt/models/llava.py +3 -1
  64. sglang/srt/models/llavavid.py +1 -1
  65. sglang/srt/models/minicpmo.py +1 -2
  66. sglang/srt/models/minicpmv.py +1 -1
  67. sglang/srt/models/mixtral_quant.py +4 -0
  68. sglang/srt/models/mllama4.py +372 -82
  69. sglang/srt/models/phi4mm.py +8 -2
  70. sglang/srt/models/phimoe.py +553 -0
  71. sglang/srt/models/qwen2.py +2 -0
  72. sglang/srt/models/qwen2_5_vl.py +10 -7
  73. sglang/srt/models/qwen2_vl.py +12 -1
  74. sglang/srt/models/vila.py +8 -2
  75. sglang/srt/multimodal/mm_utils.py +2 -2
  76. sglang/srt/multimodal/processors/base_processor.py +197 -137
  77. sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  78. sglang/srt/multimodal/processors/gemma3.py +4 -2
  79. sglang/srt/multimodal/processors/gemma3n.py +1 -1
  80. sglang/srt/multimodal/processors/internvl.py +1 -1
  81. sglang/srt/multimodal/processors/janus_pro.py +1 -1
  82. sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  83. sglang/srt/multimodal/processors/minicpm.py +4 -3
  84. sglang/srt/multimodal/processors/mllama4.py +63 -61
  85. sglang/srt/multimodal/processors/phi4mm.py +1 -1
  86. sglang/srt/multimodal/processors/pixtral.py +1 -1
  87. sglang/srt/multimodal/processors/qwen_vl.py +203 -80
  88. sglang/srt/multimodal/processors/vila.py +1 -1
  89. sglang/srt/server_args.py +26 -4
  90. sglang/srt/two_batch_overlap.py +3 -0
  91. sglang/srt/utils.py +191 -48
  92. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  93. sglang/utils.py +5 -5
  94. sglang/version.py +1 -1
  95. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
  96. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
  97. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_batch.py
@@ -101,6 +101,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "triton_attention_reduce_in_fp32",
     "num_reserved_decode_tokens",
     "weight_loader_disable_mmap",
+    "enable_triton_kernel_moe",
 ]
 
 # Put some global args for easy access
@@ -184,6 +185,10 @@ class Modality(Enum):
             f"Invalid modality string: {modality_str}. Valid modalities are: {[m.name for m in Modality]}"
         )
 
+    @staticmethod
+    def all():
+        return [Modality.IMAGE, Modality.VIDEO, Modality.AUDIO]
+
 
 @dataclasses.dataclass
 class MultimodalDataItem:
@@ -199,7 +204,7 @@ class MultimodalDataItem:
199
204
  hash: int = None
200
205
  pad_value: int = None
201
206
  image_sizes: Tuple[int, int] = None
202
- image_offsets: Optional[list] = None
207
+ offsets: Optional[list] = None
203
208
 
204
209
  # the real data, pixel_values or audio_features
205
210
  # data: Union[List[torch.Tensor], List[np.ndarray]]
@@ -252,12 +257,17 @@ class MultimodalDataItem:
252
257
  self.hash = hash_feature(self.audio_features)
253
258
  elif self.input_features is not None:
254
259
  self.hash = hash_feature(self.input_features)
260
+ elif self.is_video():
261
+ self.hash = hash_feature(self.pixel_values_videos)
255
262
  else:
256
263
  self.hash = hash_feature(self.pixel_values)
257
264
 
258
265
  assert self.hash is not None
259
266
  self.pad_value = self.hash % (1 << 30)
260
267
 
268
+ def is_modality(self, modality: Modality) -> bool:
269
+ return self.modality == modality
270
+
261
271
  def is_audio(self):
262
272
  return (self.modality == Modality.AUDIO) and (
263
273
  self.precomputed_features is not None
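
For context: the hash assignment above ends by deriving a deterministic pad token id from the feature hash, so identical multimodal payloads map to the same pad value. A minimal standalone sketch of that arithmetic (an illustration, not the sglang code itself):

    def pad_value_from_hash(feature_hash: int) -> int:
        # Fold the arbitrarily large content hash into [0, 2**30),
        # mirroring `self.pad_value = self.hash % (1 << 30)` above.
        return feature_hash % (1 << 30)

    assert 0 <= pad_value_from_hash(2**61 + 7) < 1 << 30
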
@@ -267,7 +277,7 @@ class MultimodalDataItem:
267
277
 
268
278
  def is_image(self):
269
279
  return (
270
- self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
280
+ self.is_modality(Modality.IMAGE) or self.is_modality(Modality.MULTI_IMAGES)
271
281
  ) and (
272
282
  self.precomputed_features is not None
273
283
  or not MultimodalDataItem.is_empty_list(self.pixel_values)
@@ -276,7 +286,7 @@ class MultimodalDataItem:
276
286
  def is_video(self):
277
287
  return (self.modality == Modality.VIDEO) and (
278
288
  self.precomputed_features is not None
279
- or not MultimodalDataItem.is_empty_list(self.pixel_values)
289
+ or not MultimodalDataItem.is_empty_list(self.pixel_values_videos)
280
290
  )
281
291
 
282
292
  def is_valid(self) -> bool:
@@ -350,6 +360,7 @@ class MultimodalInputs:
350
360
  "im_token_id",
351
361
  "im_start_id",
352
362
  "im_end_id",
363
+ "video_token_id",
353
364
  "slice_start_id",
354
365
  "slice_end_id",
355
366
  "audio_start_id",
@@ -363,11 +374,12 @@ class MultimodalInputs:
363
374
  return ret
364
375
 
365
376
  def contains_image_inputs(self) -> bool:
366
- """ """
367
377
  return any(item.is_image() for item in self.mm_items)
368
378
 
379
+ def contains_video_inputs(self) -> bool:
380
+ return any(item.is_video() for item in self.mm_items)
381
+
369
382
  def contains_audio_inputs(self) -> bool:
370
- """ """
371
383
  return any(item.is_audio() for item in self.mm_items)
372
384
 
373
385
  def contains_mm_input(self) -> bool:
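
With is_video() now keyed on pixel_values_videos, the new contains_video_inputs() gives callers a per-modality check that mirrors the existing image and audio variants. A hypothetical usage sketch (the surrounding mm_inputs object is assumed to be a populated MultimodalInputs):

    # Hypothetical caller; `mm_inputs` is a populated MultimodalInputs.
    if mm_inputs.contains_video_inputs():
        # Video features live in `pixel_values_videos`, separate from image
        # `pixel_values`, so per-modality handling stays unambiguous.
        video_items = [item for item in mm_inputs.mm_items if item.is_video()]
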
@@ -842,7 +854,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
842
854
  global_num_tokens_for_logprob: Optional[List[int]] = None
843
855
  is_extend_in_batch: bool = False
844
856
  can_run_dp_cuda_graph: bool = False
845
- is_extend_in_batch: bool = False
846
857
  tbo_split_seq_index: Optional[int] = None
847
858
  global_forward_mode: Optional[ForwardMode] = None
848
859
 
sglang/srt/managers/scheduler.py
@@ -13,6 +13,7 @@
 # ==============================================================================
 """A scheduler that manages a tensor parallel GPU worker."""
 
+import datetime
 import faulthandler
 import logging
 import os
@@ -484,6 +485,8 @@ class Scheduler(
484
485
  enable=server_args.enable_memory_saver
485
486
  )
486
487
  self.init_profier()
488
+
489
+ # Init metrics stats
487
490
  self.init_metrics()
488
491
  self.init_kv_events(server_args.kv_events_config)
489
492
 
@@ -590,6 +593,12 @@ class Scheduler(
590
593
  hicache_ratio=server_args.hicache_ratio,
591
594
  hicache_size=server_args.hicache_size,
592
595
  hicache_write_policy=server_args.hicache_write_policy,
596
+ hicache_io_backend=(
597
+ "direct"
598
+ if server_args.attention_backend
599
+ == "fa3" # hot fix for incompatibility
600
+ else server_args.hicache_io_backend
601
+ ),
593
602
  )
594
603
  self.tp_worker.register_hicache_layer_transfer_counter(
595
604
  self.tree_cache.cache_controller.layer_done_counter
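
The inline conditional above pins the HiCache IO backend to "direct" whenever the fa3 attention backend is active. Condensed into a standalone helper for readability (an illustration, not part of sglang):

    def pick_hicache_io_backend(attention_backend: str, requested: str) -> str:
        # fa3 is hot-fixed onto the "direct" IO path due to an
        # incompatibility with the kernel-based backend.
        return "direct" if attention_backend == "fa3" else requested
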
@@ -621,6 +630,7 @@ class Scheduler(
621
630
  self.torch_profiler_output_dir: Optional[str] = None
622
631
  self.profiler_activities: Optional[List[str]] = None
623
632
  self.profile_id: Optional[str] = None
633
+ self.profiler_start_forward_ct: Optional[int] = None
624
634
  self.profiler_target_forward_ct: Optional[int] = None
625
635
  self.profiler_target_prefill_ct: Optional[int] = None
626
636
  self.profiler_target_decode_ct: Optional[int] = None
@@ -1313,10 +1323,12 @@ class Scheduler(
             f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
             f += f"#queue-req: {len(self.waiting_queue)}, "
             f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
-            f += f"input throughput (token/s): {self.last_input_throughput:.2f} "
+            f += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
         else:
             f += f"#running-req: {running_bs}, "
-            f += f"#queue-req: {len(self.waiting_queue)}"
+            f += f"#queue-req: {len(self.waiting_queue)}, "
+
+        f += f"timestamp: {datetime.datetime.now().isoformat()}"
 
         logger.info(f)
 
@@ -1378,7 +1390,8 @@ class Scheduler(
         msg += (
             f"cuda graph: {can_run_cuda_graph}, "
             f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
-            f"#queue-req: {len(self.waiting_queue)}"
+            f"#queue-req: {len(self.waiting_queue)}, "
+            f"timestamp: {datetime.datetime.now().isoformat()}"
         )
 
         logger.info(msg)
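
Both the prefill and decode log lines now end with an ISO-8601 timestamp. The format comes straight from the standard library:

    import datetime

    # isoformat() yields e.g. "2025-07-14T09:30:00.123456": microsecond
    # precision, with no timezone suffix for a naive datetime.
    print(datetime.datetime.now().isoformat())
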
@@ -2333,9 +2346,8 @@ class Scheduler(
 
     def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput):
         tags = recv_req.tags
-        import subprocess
 
-        if tags is None:
+        if tags is None or len(tags) == 0:
             tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
 
         if GPU_MEMORY_TYPE_KV_CACHE in tags:
@@ -2346,17 +2358,20 @@ class Scheduler(
             self.stashed_model_static_state = _export_static_state(
                 self.tp_worker.worker.model_runner.model
             )
+            torch.distributed.barrier(self.tp_cpu_group)
             self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
 
         return ReleaseMemoryOccupationReqOutput()
 
     def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput):
         tags = recv_req.tags
+
         if tags is None or len(tags) == 0:
             tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
 
         if GPU_MEMORY_TYPE_WEIGHTS in tags:
             self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
+            torch.distributed.barrier(self.tp_cpu_group)
             _import_static_state(
                 self.tp_worker.worker.model_runner.model,
                 self.stashed_model_static_state,
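
The two new barriers make weight release and resume symmetric across tensor-parallel ranks: no rank pauses its weights until every rank has stashed static state, and no rank re-imports static state until every rank has its weight memory back. A simplified schematic of that ordering (the export/import helpers are placeholder stand-ins for the private _export_static_state/_import_static_state seen above):

    import torch.distributed as dist

    # Placeholder stand-ins for the private helpers in the diff.
    def export_static_state(model):
        return {}

    def import_static_state(model, stashed):
        pass

    def release_weights(model, adapter, tp_cpu_group):
        stashed = export_static_state(model)  # every rank stashes first
        dist.barrier(tp_cpu_group)            # ...then all ranks sync...
        adapter.pause("weights")              # ...so no rank frees memory
        return stashed                        # while a peer is still exporting

    def resume_weights(model, adapter, stashed, tp_cpu_group):
        adapter.resume("weights")             # re-allocate on every rank
        dist.barrier(tp_cpu_group)            # all memory is back before...
        import_static_state(model, stashed)   # ...restoring shared state
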
@@ -2377,9 +2392,10 @@ class Scheduler(
 
     def profile(self, recv_req: ProfileReq):
         if recv_req.type == ProfileReqType.START_PROFILE:
-            if recv_req.profile_by_stage:
+            if recv_req.profile_by_stage or recv_req.start_step:
                 return self.init_profile(
                     recv_req.output_dir,
+                    recv_req.start_step,
                     recv_req.num_steps,
                     recv_req.activities,
                     recv_req.with_stack,
@@ -2390,6 +2406,7 @@ class Scheduler(
             else:
                 self.init_profile(
                     recv_req.output_dir,
+                    recv_req.start_step,
                     recv_req.num_steps,
                     recv_req.activities,
                     recv_req.with_stack,
@@ -2404,6 +2421,7 @@ class Scheduler(
     def init_profile(
         self,
         output_dir: Optional[str],
+        start_step: Optional[int],
        num_steps: Optional[int],
         activities: Optional[List[str]],
         with_stack: Optional[bool],
@@ -2430,6 +2448,9 @@ class Scheduler(
         self.profiler_activities = activities
         self.profile_id = profile_id
 
+        if start_step:
+            self.profiler_start_forward_ct = max(start_step, self.forward_ct + 1)
+
         if num_steps:
             self.profile_steps = num_steps
             if self.profile_by_stage:
@@ -2437,6 +2458,10 @@ class Scheduler(
                 self.profiler_target_decode_ct = num_steps
                 self.profiler_prefill_ct = 0
                 self.profiler_decode_ct = 0
+            elif start_step:
+                self.profiler_target_forward_ct = (
+                    self.profiler_start_forward_ct + num_steps
+                )
             else:
                 self.profiler_target_forward_ct = self.forward_ct + num_steps
                 # The caller will be notified when reaching profiler_target_forward_ct
@@ -2509,6 +2534,7 @@ class Scheduler(
 
         if "CUDA_PROFILER" in activities:
             torch.cuda.cudart().cudaProfilerStart()
+            self.profile_in_progress = True
 
         return ProfileReqOutput(success=True, message="Succeeded")
 
@@ -2572,6 +2598,7 @@
         )
         self.torch_profiler = None
         self.profile_in_progress = False
+        self.profiler_start_forward_ct = None
 
         return ProfileReqOutput(success=True, message="Succeeded.")
 
@@ -2605,6 +2632,11 @@
             and self.profiler_target_forward_ct <= self.forward_ct
         ):
             self.stop_profile()
+        if (
+            self.profiler_start_forward_ct
+            and self.profiler_start_forward_ct == self.forward_ct
+        ):
+            self.start_profile()
 
     def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
         if recv_req == ExpertDistributionReq.START_RECORD:
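
Taken together, the start_step plumbing lets a profile request arm itself for a future step: init_profile records a start step clamped to the next forward pass, the per-step check above fires start_profile() exactly when forward_ct reaches it, and the stop target counts from the armed start. The window arithmetic as a standalone sketch:

    def profile_window(forward_ct: int, start_step: int, num_steps: int):
        # A profile can't start in the past: clamp to the next forward step,
        # mirroring `max(start_step, self.forward_ct + 1)` in init_profile.
        start = max(start_step, forward_ct + 1)
        # The stop target is measured from the (possibly clamped) start.
        return start, start + num_steps

    # Requesting start_step=100, num_steps=10 while forward_ct=42
    # profiles forward steps 100 through 109.
    assert profile_window(42, 100, 10) == (100, 110)
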
sglang/srt/managers/tokenizer_manager.py
@@ -285,6 +285,20 @@ class TokenizerManager:
             self.bootstrap_server = kv_bootstrap_server_class(
                 self.server_args.disaggregation_bootstrap_port
             )
+            is_create_store = (
+                self.server_args.node_rank == 0
+                and self.server_args.disaggregation_transfer_backend == "ascend"
+            )
+            if is_create_store:
+                try:
+                    from mf_adapter import create_config_store
+
+                    ascend_url = os.getenv("ASCEND_MF_STORE_URL")
+                    create_config_store(ascend_url)
+                except Exception as e:
+                    error_message = f"Failed create mf store, invalid ascend_url."
+                    error_message += f" With exception {e}"
+                    raise error_message
 
         # For load balancing
         self.current_load = 0
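
The Ascend path reads ASCEND_MF_STORE_URL from the environment of node rank 0 before creating the config store. A hypothetical pre-launch setup (the URL value is illustrative only):

    import os

    # Must be set before the server starts on node rank 0 whenever the
    # disaggregation_transfer_backend server arg is "ascend".
    os.environ["ASCEND_MF_STORE_URL"] = "tcp://127.0.0.1:24000"
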
@@ -863,6 +877,7 @@ class TokenizerManager:
     async def start_profile(
         self,
         output_dir: Optional[str] = None,
+        start_step: Optional[int] = None,
         num_steps: Optional[int] = None,
         activities: Optional[List[str]] = None,
         with_stack: Optional[bool] = None,
@@ -875,6 +890,7 @@ class TokenizerManager:
         req = ProfileReq(
             type=ProfileReqType.START_PROFILE,
             output_dir=output_dir,
+            start_step=start_step,
             num_steps=num_steps,
             activities=activities,
             with_stack=with_stack,
@@ -34,6 +34,7 @@ class HiRadixCache(RadixCache):
34
34
  hicache_ratio: float,
35
35
  hicache_size: int,
36
36
  hicache_write_policy: str,
37
+ hicache_io_backend: str,
37
38
  ):
38
39
  self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
39
40
  if isinstance(self.kv_cache, MHATokenToKVPool):
@@ -56,6 +57,7 @@ class HiRadixCache(RadixCache):
56
57
  page_size,
57
58
  load_cache_event=self.load_cache_event,
58
59
  write_policy=hicache_write_policy,
60
+ io_backend=hicache_io_backend,
59
61
  )
60
62
 
61
63
  # record the nodes with ongoing write through