sglang 0.4.3.post4__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/hf_transformers_utils.py +16 -1
  14. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  15. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  16. sglang/srt/layers/attention/triton_backend.py +1 -3
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  18. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  19. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  20. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  21. sglang/srt/layers/attention/vision.py +43 -62
  22. sglang/srt/layers/linear.py +1 -1
  23. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  24. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  32. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  33. sglang/srt/layers/parameter.py +10 -0
  34. sglang/srt/layers/quantization/__init__.py +90 -68
  35. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  36. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/fp8.py +174 -106
  63. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  64. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  65. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  66. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  67. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  68. sglang/srt/layers/rotary_embedding.py +5 -3
  69. sglang/srt/layers/sampler.py +29 -35
  70. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  71. sglang/srt/lora/backend/__init__.py +9 -12
  72. sglang/srt/managers/cache_controller.py +72 -8
  73. sglang/srt/managers/image_processor.py +37 -631
  74. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  75. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  76. sglang/srt/managers/image_processors/llava.py +152 -0
  77. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  78. sglang/srt/managers/image_processors/mlama.py +60 -0
  79. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  80. sglang/srt/managers/io_struct.py +32 -15
  81. sglang/srt/managers/multi_modality_padding.py +134 -0
  82. sglang/srt/managers/schedule_batch.py +212 -117
  83. sglang/srt/managers/schedule_policy.py +40 -8
  84. sglang/srt/managers/scheduler.py +124 -665
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +611 -0
  86. sglang/srt/managers/tokenizer_manager.py +6 -6
  87. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  88. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  89. sglang/srt/mem_cache/chunk_cache.py +12 -44
  90. sglang/srt/mem_cache/hiradix_cache.py +63 -34
  91. sglang/srt/mem_cache/memory_pool.py +78 -17
  92. sglang/srt/mem_cache/paged_allocator.py +283 -0
  93. sglang/srt/mem_cache/radix_cache.py +117 -36
  94. sglang/srt/model_executor/cuda_graph_runner.py +9 -4
  95. sglang/srt/model_executor/forward_batch_info.py +12 -8
  96. sglang/srt/model_executor/model_runner.py +63 -63
  97. sglang/srt/model_loader/loader.py +2 -1
  98. sglang/srt/model_loader/weight_utils.py +1 -1
  99. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  100. sglang/srt/models/deepseek_nextn.py +23 -3
  101. sglang/srt/models/deepseek_v2.py +25 -19
  102. sglang/srt/models/minicpmv.py +28 -89
  103. sglang/srt/models/mllama.py +1 -1
  104. sglang/srt/models/qwen2.py +0 -1
  105. sglang/srt/models/qwen2_5_vl.py +25 -50
  106. sglang/srt/models/qwen2_vl.py +33 -49
  107. sglang/srt/openai_api/adapter.py +37 -15
  108. sglang/srt/openai_api/protocol.py +8 -1
  109. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  110. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  111. sglang/srt/server_args.py +19 -11
  112. sglang/srt/speculative/eagle_worker.py +75 -39
  113. sglang/srt/utils.py +104 -9
  114. sglang/test/runners.py +104 -10
  115. sglang/test/test_block_fp8.py +106 -16
  116. sglang/test/test_custom_ops.py +88 -0
  117. sglang/test/test_utils.py +20 -4
  118. sglang/utils.py +0 -4
  119. sglang/version.py +1 -1
  120. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/METADATA +9 -10
  121. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/RECORD +124 -79
  122. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/WHEEL +1 -1
  123. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/LICENSE +0 -0
  124. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/top_level.txt +0 -0
@@ -73,16 +73,29 @@ class CacheAgnosticPolicy(Enum):
73
73
  class SchedulePolicy:
74
74
  Policy = Union[CacheAwarePolicy, CacheAgnosticPolicy]
75
75
 
76
- def __init__(self, policy: str, tree_cache: BasePrefixCache):
76
+ def __init__(
77
+ self,
78
+ policy: str,
79
+ tree_cache: BasePrefixCache,
80
+ enable_hierarchical_cache: bool,
81
+ ):
77
82
  self.policy = self._validate_and_adjust_policy(policy, tree_cache)
78
83
  self.tree_cache = tree_cache
84
+ self.enable_hierarchical_cache = enable_hierarchical_cache
79
85
 
80
86
  # It is used to find the matching prefix for in-batch prefix caching.
81
87
  self.waiting_queue_radix_tree = RadixCache(
82
- req_to_token_pool=None, token_to_kv_pool_allocator=None, disable=False
88
+ req_to_token_pool=None,
89
+ token_to_kv_pool_allocator=None,
90
+ page_size=1,
91
+ disable=False,
83
92
  )
84
93
 
85
94
  def calc_priority(self, waiting_queue: List[Req]) -> bool:
95
+ if self.policy == CacheAgnosticPolicy.FCFS:
96
+ # A shortcut for FCFS
97
+ return
98
+
86
99
  policy = self._determine_active_policy(waiting_queue)
87
100
 
88
101
  prefix_computed = False
@@ -112,7 +125,7 @@ class SchedulePolicy:
112
125
  return prefix_computed
113
126
 
114
127
  def _determine_active_policy(self, waiting_queue: List[Req]) -> Policy:
115
- if len(waiting_queue) > 128 and self.policy == CacheAwarePolicy.LPM:
128
+ if self.policy == CacheAwarePolicy.LPM and len(waiting_queue) > 128:
116
129
  # Turn off the expensive prefix matching and sorting when the #queue is large.
117
130
  return CacheAgnosticPolicy.FCFS
118
131
  return self.policy
@@ -149,9 +162,14 @@ class SchedulePolicy:
149
162
  prefix_ids = r.adjust_max_prefix_ids()
150
163
 
151
164
  # NOTE: the prefix_indices must always be aligned with last_node
152
- r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
153
- rid=r.rid, key=prefix_ids
154
- )
165
+ if self.enable_hierarchical_cache:
166
+ r.prefix_indices, r.last_node, r.last_node_global = (
167
+ self.tree_cache.match_prefix(key=prefix_ids, include_evicted=True)
168
+ )
169
+ else:
170
+ r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
171
+ rid=r.rid, key=prefix_ids
172
+ )
155
173
 
156
174
  # NOTE(sang): This logic is for in-batch prefix caching;
157
175
  # If there are more than 1 request that have small matching prefix from
@@ -428,8 +446,10 @@ class PrefillAdder:
428
446
 
429
447
  return self.budget_state()
430
448
 
431
- def add_one_req(self, req: Req, has_chunked_req: bool):
432
- if req.sampling_params.ignore_eos and self.tree_cache.disable:
449
+ def add_one_req(
450
+ self, req: Req, has_chunked_req: bool, enable_hierarchical_cache: bool = False
451
+ ):
452
+ if req.sampling_params.ignore_eos and getattr(self.tree_cache, "disable", True):
433
453
  return self.add_one_req_ignore_eos(req, has_chunked_req)
434
454
 
435
455
  total_tokens = req.extend_input_len + min(
@@ -448,6 +468,18 @@ class PrefillAdder:
448
468
  if total_tokens > self.rem_total_tokens:
449
469
  return AddReqResult.NO_TOKEN
450
470
 
471
+ if (
472
+ enable_hierarchical_cache
473
+ and req.last_node_global is not None
474
+ and req.last_node_global.evicted
475
+ ):
476
+ req.last_node, req.prefix_indices = self.tree_cache.init_load_back(
477
+ req.last_node_global, req.prefix_indices
478
+ )
479
+ req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
480
+ input_tokens = req.extend_input_len
481
+ prefix_len = len(req.prefix_indices)
482
+
451
483
  if self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens:
452
484
  # Non-chunked prefill
453
485
  self.can_run_list.append(req)