sglang 0.4.2.post3__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (88)
  1. sglang/check_env.py +1 -0
  2. sglang/global_config.py +2 -0
  3. sglang/srt/constrained/outlines_backend.py +4 -1
  4. sglang/srt/entrypoints/engine.py +2 -2
  5. sglang/srt/layers/attention/flashinfer_backend.py +265 -147
  6. sglang/srt/layers/attention/triton_backend.py +358 -72
  7. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  8. sglang/srt/layers/linear.py +12 -5
  9. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
  10. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  11. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +178 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +175 -0
  18. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -5
  19. sglang/srt/layers/moe/fused_moe_triton/layer.py +2 -0
  20. sglang/srt/layers/moe/topk.py +1 -1
  21. sglang/srt/layers/quantization/__init__.py +51 -5
  22. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
  25. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  26. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  31. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +29 -29
  32. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  33. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +33 -33
  35. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
  37. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  38. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +27 -27
  39. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
  41. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  48. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +24 -24
  49. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  50. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
  51. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  52. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +42 -42
  53. sglang/srt/layers/quantization/fp8_kernel.py +123 -17
  54. sglang/srt/layers/quantization/fp8_utils.py +33 -4
  55. sglang/srt/lora/backend/__init__.py +25 -5
  56. sglang/srt/lora/backend/base_backend.py +31 -9
  57. sglang/srt/lora/backend/flashinfer_backend.py +41 -4
  58. sglang/srt/lora/backend/triton_backend.py +34 -4
  59. sglang/srt/lora/layers.py +293 -0
  60. sglang/srt/lora/lora.py +101 -326
  61. sglang/srt/lora/lora_manager.py +101 -269
  62. sglang/srt/lora/mem_pool.py +174 -0
  63. sglang/srt/lora/triton_ops/__init__.py +7 -1
  64. sglang/srt/lora/triton_ops/gate_up_lora_b.py +170 -0
  65. sglang/srt/lora/triton_ops/qkv_lora_b.py +5 -5
  66. sglang/srt/lora/triton_ops/sgemm_lora_a.py +2 -2
  67. sglang/srt/lora/triton_ops/sgemm_lora_b.py +2 -2
  68. sglang/srt/lora/utils.py +141 -0
  69. sglang/srt/managers/detokenizer_manager.py +1 -0
  70. sglang/srt/managers/io_struct.py +4 -0
  71. sglang/srt/managers/schedule_batch.py +16 -3
  72. sglang/srt/managers/scheduler.py +29 -0
  73. sglang/srt/managers/tokenizer_manager.py +6 -0
  74. sglang/srt/managers/tp_worker_overlap_thread.py +4 -0
  75. sglang/srt/model_executor/cuda_graph_runner.py +16 -1
  76. sglang/srt/model_executor/model_runner.py +12 -2
  77. sglang/srt/models/deepseek_v2.py +17 -7
  78. sglang/srt/server_args.py +20 -1
  79. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  80. sglang/srt/speculative/eagle_utils.py +64 -21
  81. sglang/srt/speculative/eagle_worker.py +29 -8
  82. sglang/srt/utils.py +7 -0
  83. sglang/version.py +1 -1
  84. {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/METADATA +6 -5
  85. {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/RECORD +88 -55
  86. {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/LICENSE +0 -0
  87. {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/WHEEL +0 -0
  88. {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py CHANGED
@@ -255,6 +255,8 @@ class DeepseekV2Attention(nn.Module):
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
+            # FIXME: quick fix for skip quantization
+            prefix=f"self_attn.kv_a_proj_with_mqa",
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
@@ -455,6 +457,8 @@ class DeepseekV2AttentionMLA(nn.Module):
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
+            # FIXME: quick fix for skip quantization
+            prefix=f"self_attn.kv_a_proj_with_mqa",
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)

@@ -506,14 +510,20 @@ class DeepseekV2AttentionMLA(nn.Module):
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
     ) -> torch.Tensor:
-        # Use normal computation for prefill and use weight absorption for extend/decode
-        if (
-            forward_batch.forward_mode.is_extend()
-            and forward_batch.extend_prefix_lens.sum() == 0
-        ):
-            return self.forward_normal(positions, hidden_states, forward_batch)
+        if global_server_args_dict["enable_flashinfer_mla"]:
+            if forward_batch.forward_mode.is_extend():
+                return self.forward_normal(positions, hidden_states, forward_batch)
+            else:
+                return self.forward_absorb(positions, hidden_states, forward_batch)
         else:
-            return self.forward_absorb(positions, hidden_states, forward_batch)
+            # Triton: Use normal computation for prefill and use weight absorption for extend/decode
+            if (
+                forward_batch.forward_mode.is_extend()
+                and forward_batch.extend_prefix_lens.sum() == 0
+            ):
+                return self.forward_normal(positions, hidden_states, forward_batch)
+            else:
+                return self.forward_absorb(positions, hidden_states, forward_batch)

     def forward_normal(
         self,
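For context, the new branch keys off the --enable-flashinfer-mla flag added to server_args.py below: when set, extend batches take forward_normal and decode batches take forward_absorb. A minimal sketch of turning it on through the offline engine API (the model name is a placeholder, and it assumes ServerArgs fields pass straight through as sgl.Engine keyword arguments):

    import sglang as sgl

    # Sketch only: route DeepSeek MLA attention through the FlashInfer path.
    llm = sgl.Engine(
        model_path="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
        enable_flashinfer_mla=True,                 # new flag in 0.4.3
    )
    print(llm.generate("Hello", {"max_new_tokens": 8}))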
sglang/srt/server_args.py CHANGED
@@ -140,6 +140,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
@@ -160,12 +161,15 @@ class ServerArgs:
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
+    return_hidden_states: bool = False

     # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False

+    enable_flashinfer_mla: bool = False
+
     def __post_init__(self):
         # Set missing default values
         if self.tokenizer_path is None:
@@ -691,6 +695,11 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mla",
+            action="store_true",
+            help="Enable FlashInfer MLA optimization",
+        )

         # Speculative decoding
         parser.add_argument(
@@ -782,6 +791,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-nccl-nvls",
+            action="store_true",
+            help="Enable NCCL NVLS for prefill heavy requests when available.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",
@@ -795,7 +809,7 @@ class ServerArgs:
         parser.add_argument(
             "--disable-mla",
             action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",
@@ -896,6 +910,11 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
+        parser.add_argument(
+            "--return-hidden-states",
+            action="store_true",
+            help="Return hidden states in the response.",
+        )
         # Function Calling
         parser.add_argument(
             "--tool-call-parser",
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -85,6 +85,7 @@ class EAGLEDraftCudaGraphRunner:
             "1. disable cuda graph by --disable-cuda-graph\n"
             "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
             "3. disable torch compile by not using --enable-torch-compile\n"
+            "4. specify --dtype to the same dtype (e.g. bfloat16)\n"
             "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
         )
sglang/srt/speculative/eagle_utils.py CHANGED
@@ -4,6 +4,7 @@ import dataclasses
 from typing import TYPE_CHECKING, List

 import torch
+import torch.nn.functional as F
 import triton
 import triton.language as tl

@@ -11,7 +12,14 @@ from sglang.srt.layers.attention.flashinfer_backend import (
     create_flashinfer_kv_indices_triton,
 )
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
-from sglang.srt.speculative.build_eagle_tree import build_tree_kernel
+from sglang.srt.speculative.build_eagle_tree import (
+    build_tree_kernel,
+    build_tree_kernel_efficient,
+)
+from sglang.srt.utils import is_cuda_available
+
+if is_cuda_available():
+    from sgl_kernel import tree_speculative_sampling_target_only

 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -160,8 +168,11 @@ class EagleVerifyInput:
     custom_mask: torch.Tensor
     positions: torch.Tensor
     retrive_index: torch.Tensor
+    retrive_next_token: torch.Tensor
+    retrive_next_sibling: torch.Tensor
     retrive_cum_len: torch.Tensor
     draft_token_num: int
+    spec_steps: int
     capture_hidden_mode: CaptureHiddenMode

     @classmethod
@@ -175,10 +186,45 @@ class EagleVerifyInput:
         seq_lens_sum: int,
         topk: int,
         spec_steps: int,
-        num_verify_token: int,
+        num_verify_tokens: int,
+        is_all_greedy: bool,
     ):
-        tree_mask, position, retrive_index, retrive_cum_len, draft_tokens = (
-            build_tree_kernel(
+        if is_all_greedy:
+            tree_mask, position, retrive_index, retrive_cum_len, draft_tokens = (
+                build_tree_kernel(
+                    verified_id,
+                    score_list,  # b, n, topk; n= 1 + (num_steps-1) * self.topk
+                    token_list,
+                    parents_list,
+                    seq_lens,
+                    seq_lens_sum,
+                    topk,
+                    spec_steps,
+                    num_verify_tokens,
+                )
+            )
+
+            return cls(
+                draft_tokens,
+                tree_mask,
+                position,
+                retrive_index,
+                None,
+                None,
+                retrive_cum_len,
+                num_verify_tokens,
+                spec_steps,
+                CaptureHiddenMode.FULL,
+            )
+        else:
+            (
+                tree_mask,
+                position,
+                retrive_index,
+                retrive_next_token,
+                retrive_next_sibling,
+                draft_tokens,
+            ) = build_tree_kernel_efficient(
                 verified_id,
                 score_list,
                 token_list,
@@ -187,18 +233,21 @@ class EagleVerifyInput:
                 seq_lens_sum,
                 topk,
                 spec_steps,
-                num_verify_token,
+                num_verify_tokens,
+            )
+
+            return cls(
+                draft_tokens,
+                tree_mask,
+                position,
+                retrive_index,
+                retrive_next_token,
+                retrive_next_sibling,
+                None,
+                num_verify_tokens,
+                spec_steps,
+                CaptureHiddenMode.FULL,
             )
-        )
-        return cls(
-            draft_tokens,
-            tree_mask,
-            position,
-            retrive_index,
-            retrive_cum_len,
-            num_verify_token,
-            CaptureHiddenMode.FULL,
-        )

     def prepare_for_verify(self, batch: ScheduleBatch):
         batch.input_ids = self.draft_token
@@ -313,12 +362,6 @@ class EagleVerifyInput:
             uniform_samples=coins,
             target_probs=target_probs,
             draft_probs=draft_probs,
-            threshold_single=global_server_args_dict[
-                "speculative_accept_threshold_single"
-            ],
-            threshold_acc=global_server_args_dict[
-                "speculative_accept_threshold_acc"
-            ],
             deterministic=True,
         )
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -65,15 +65,31 @@ class EAGLEWorker(TpModelWorker):
         self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph

         # Create multi-step attn backends and cuda graph runners
-        from sglang.srt.layers.attention.flashinfer_backend import (
-            FlashInferMultiStepDraftBackend,
-        )
+        if server_args.attention_backend == "flashinfer":
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        elif server_args.attention_backend == "triton":
+            from sglang.srt.layers.attention.triton_backend import (
+                TritonMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = TritonMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        else:
+            raise ValueError(
+                f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
+            )

-        self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-            self.model_runner,
-            self.topk,
-            self.speculative_num_steps,
-        )
         self.model_runner.draft_attn_backend = self.draft_attn_backend
         self.init_cuda_graphs()

@@ -185,6 +201,7 @@ class EAGLEWorker(TpModelWorker):
             self.topk,
             self.speculative_num_steps,
             self.server_args.speculative_num_draft_tokens,
+            batch.sampling_info.is_all_greedy,
         )

         # Free cache locations
@@ -217,6 +234,10 @@ class EAGLEWorker(TpModelWorker):
             token_list.append(tree_info[1])
             parents_list.append(tree_info[2])

+            # we don't need to run the last forward. we get 1 token from draft prefill and (#spec steps - 1) tokens here
+            if i == self.speculative_num_steps - 1:
+                break
+
             # Set inputs
             forward_batch.input_ids = input_ids
             forward_batch.out_cache_loc = out_cache_loc[
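With the backend dispatch above, EAGLE speculative decoding now runs on the Triton attention backend as well as FlashInfer; any other backend raises a ValueError. A hedged launch sketch (model and draft-model names are placeholders; the speculative_* kwargs are existing ServerArgs fields referenced elsewhere in this diff, assumed to pass through sgl.Engine):

    import sglang as sgl

    llm = sgl.Engine(
        model_path="meta-llama/Llama-2-7b-chat-hf",                   # placeholder
        speculative_algorithm="EAGLE",
        speculative_draft_model_path="yuhuili/EAGLE-llama2-chat-7B",  # placeholder
        speculative_num_steps=5,
        speculative_num_draft_tokens=64,
        attention_backend="triton",  # newly accepted for EAGLE in 0.4.3
    )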
sglang/srt/utils.py CHANGED
@@ -1444,3 +1444,10 @@ def launch_dummy_health_check_server(host, port):
         timeout_keep_alive=5,
         loop="uvloop",
     )
+
+
+def set_cuda_arch():
+    if is_flashinfer_available():
+        capability = torch.cuda.get_device_capability()
+        arch = f"{capability[0]}.{capability[1]}"
+        os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
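The new helper pins TORCH_CUDA_ARCH_LIST to the local GPU's compute capability, appending +PTX only on 9.0 (Hopper), so JIT-compiled kernels target the right architecture. A worked example of the string it produces (the capability tuples are illustrative):

    # Same expression as above, evaluated for a few capabilities.
    for capability in [(8, 0), (8, 9), (9, 0)]:
        arch = f"{capability[0]}.{capability[1]}"
        print(f"{arch}{'+PTX' if arch == '9.0' else ''}")
    # -> 8.0
    # -> 8.9
    # -> 9.0+PTX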
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.2.post3"
+__version__ = "0.4.3"
{sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post3
+Version: 0.4.3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -236,14 +236,15 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
+Requires-Dist: ninja; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3.post2; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.3.post6; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.6.4.post1; extra == "srt"
-Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt"
+Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
+Requires-Dist: flashinfer_python>=0.2.1.post1; extra == "srt"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
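Dependency changes in the srt extra: sgl-kernel and flashinfer_python move to newer minimums, vllm and outlines trade hard or stale pins for bounded ranges, and ninja joins runtime-common (presumably for JIT kernel builds). A sketch of a programmatic upgrade that picks these up (plain `pip install "sglang[srt]==0.4.3"` works equally well):

    import subprocess, sys

    # Install sglang 0.4.3 with the serving extra and its updated pins.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "sglang[srt]==0.4.3"]
    )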