sglang 0.4.0__py3-none-any.whl → 0.4.0.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. sglang/__init__.py +1 -1
  2. sglang/bench_offline_throughput.py +18 -6
  3. sglang/bench_one_batch.py +13 -0
  4. sglang/bench_serving.py +8 -1
  5. sglang/check_env.py +140 -48
  6. sglang/lang/backend/runtime_endpoint.py +1 -0
  7. sglang/lang/chat_template.py +32 -0
  8. sglang/llama3_eval.py +316 -0
  9. sglang/srt/constrained/outlines_backend.py +5 -0
  10. sglang/srt/constrained/xgrammar_backend.py +9 -6
  11. sglang/srt/layers/attention/__init__.py +5 -2
  12. sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
  13. sglang/srt/layers/attention/flashinfer_backend.py +22 -5
  14. sglang/srt/layers/attention/torch_native_backend.py +22 -8
  15. sglang/srt/layers/attention/triton_backend.py +38 -33
  16. sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
  17. sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
  18. sglang/srt/layers/ep_moe/__init__.py +0 -0
  19. sglang/srt/layers/ep_moe/kernels.py +349 -0
  20. sglang/srt/layers/ep_moe/layer.py +665 -0
  21. sglang/srt/layers/fused_moe_triton/fused_moe.py +64 -21
  22. sglang/srt/layers/fused_moe_triton/layer.py +1 -1
  23. sglang/srt/layers/logits_processor.py +133 -95
  24. sglang/srt/layers/quantization/__init__.py +2 -47
  25. sglang/srt/layers/quantization/fp8.py +607 -0
  26. sglang/srt/layers/quantization/fp8_utils.py +27 -0
  27. sglang/srt/layers/radix_attention.py +11 -2
  28. sglang/srt/layers/sampler.py +29 -5
  29. sglang/srt/layers/torchao_utils.py +58 -45
  30. sglang/srt/managers/detokenizer_manager.py +37 -17
  31. sglang/srt/managers/io_struct.py +39 -10
  32. sglang/srt/managers/schedule_batch.py +39 -24
  33. sglang/srt/managers/schedule_policy.py +64 -5
  34. sglang/srt/managers/scheduler.py +236 -197
  35. sglang/srt/managers/tokenizer_manager.py +99 -58
  36. sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
  37. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  38. sglang/srt/mem_cache/chunk_cache.py +2 -2
  39. sglang/srt/mem_cache/memory_pool.py +5 -1
  40. sglang/srt/mem_cache/radix_cache.py +12 -2
  41. sglang/srt/model_executor/cuda_graph_runner.py +39 -11
  42. sglang/srt/model_executor/model_runner.py +24 -9
  43. sglang/srt/model_parallel.py +67 -10
  44. sglang/srt/models/commandr.py +2 -2
  45. sglang/srt/models/deepseek_v2.py +87 -7
  46. sglang/srt/models/gemma2.py +34 -0
  47. sglang/srt/models/gemma2_reward.py +0 -1
  48. sglang/srt/models/granite.py +517 -0
  49. sglang/srt/models/grok.py +72 -13
  50. sglang/srt/models/llama.py +22 -5
  51. sglang/srt/models/llama_classification.py +11 -23
  52. sglang/srt/models/llama_reward.py +0 -2
  53. sglang/srt/models/llava.py +37 -14
  54. sglang/srt/models/mixtral.py +12 -9
  55. sglang/srt/models/phi3_small.py +0 -5
  56. sglang/srt/models/qwen2.py +20 -0
  57. sglang/srt/models/qwen2_moe.py +0 -5
  58. sglang/srt/models/torch_native_llama.py +0 -5
  59. sglang/srt/openai_api/adapter.py +4 -0
  60. sglang/srt/openai_api/protocol.py +9 -4
  61. sglang/srt/sampling/sampling_batch_info.py +9 -8
  62. sglang/srt/server.py +4 -4
  63. sglang/srt/server_args.py +62 -13
  64. sglang/srt/utils.py +57 -10
  65. sglang/test/test_utils.py +3 -2
  66. sglang/utils.py +10 -3
  67. sglang/version.py +1 -1
  68. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/METADATA +15 -9
  69. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/RECORD +72 -65
  70. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/LICENSE +0 -0
  71. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/WHEEL +0 -0
  72. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py CHANGED
@@ -510,6 +510,8 @@ def v1_generate_request(
  "stop": request.stop,
  "stop_token_ids": request.stop_token_ids,
  "top_p": request.top_p,
+ "top_k": request.top_k,
+ "min_p": request.min_p,
  "presence_penalty": request.presence_penalty,
  "frequency_penalty": request.frequency_penalty,
  "repetition_penalty": request.repetition_penalty,
@@ -926,6 +928,8 @@ def v1_chat_generate_request(
  "stop": stop,
  "stop_token_ids": request.stop_token_ids,
  "top_p": request.top_p,
+ "top_k": request.top_k,
+ "min_p": request.min_p,
  "presence_penalty": request.presence_penalty,
  "frequency_penalty": request.frequency_penalty,
  "repetition_penalty": request.repetition_penalty,
sglang/srt/openai_api/protocol.py CHANGED
@@ -166,17 +166,19 @@ class CompletionRequest(BaseModel):
  temperature: float = 1.0
  top_p: float = 1.0
  user: Optional[str] = None
- lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None

  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
- json_schema: Optional[str] = None
- regex: Optional[str] = None
+ top_k: int = -1
+ min_p: float = 0.0
  min_tokens: int = 0
+ regex: Optional[str] = None
+ json_schema: Optional[str] = None
  repetition_penalty: float = 1.0
  stop_token_ids: Optional[List[int]] = None
  no_stop_trim: bool = False
  ignore_eos: bool = False
  skip_special_tokens: bool = True
+ lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None


  class CompletionResponseChoice(BaseModel):
@@ -276,13 +278,16 @@ class ChatCompletionRequest(BaseModel):
  user: Optional[str] = None

  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
- regex: Optional[str] = None
+ top_k: int = -1
+ min_p: float = 0.0
  min_tokens: int = 0
+ regex: Optional[str] = None
  repetition_penalty: float = 1.0
  stop_token_ids: Optional[List[int]] = None
  no_stop_trim: bool = False
  ignore_eos: bool = False
  skip_special_tokens: bool = True
+ lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None


  class ChatMessage(BaseModel):
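Client-side, the top_k and min_p fields now declared on these models and forwarded by the OpenAI-compatible adapter can be passed as extra body parameters. A minimal sketch, assuming a local SGLang server on its default port 30000 and the openai>=1.0 Python client; the model name and values are placeholders.

import openai

client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Name three prime numbers."}],
    temperature=0.7,
    # SRT-only sampling parameters newly accepted in 0.4.0.post2.
    extra_body={"top_k": 40, "min_p": 0.05},
)
print(response.choices[0].message.content)

Fields outside the standard OpenAI schema travel through extra_body and are validated against the CompletionRequest/ChatCompletionRequest models shown above.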
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -158,22 +158,23 @@ class SamplingBatchInfo:
  return

  # find a grammar from the list
- grammar = next(grammar for grammar in self.grammars if grammar)
+ first_grammar = next(grammar for grammar in self.grammars if grammar)

  # maybe we can reuse the existing mask?
- self.vocab_mask = grammar.allocate_vocab_mask(
+ self.vocab_mask = first_grammar.allocate_vocab_mask(
  vocab_size=self.vocab_size,
  batch_size=len(self.temperatures),
  device=self.device,
  )
- self.apply_mask = type(grammar).apply_vocab_mask # force to use static method
+ self.apply_mask = first_grammar.apply_vocab_mask # force to use static method

+ # Apply the mask
  for i, grammar in enumerate(self.grammars):
- if grammar is not None:
- try:
- grammar.fill_vocab_mask(self.vocab_mask, i)
- except RuntimeError:
- continue
+ if grammar and not grammar.finished:
+ grammar.fill_vocab_mask(self.vocab_mask, i)
+
+ # Move the mask to the device if needed
+ self.vocab_mask = first_grammar.move_vocab_mask(self.vocab_mask, self.device)

  def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
  self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
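For readers unfamiliar with the grammar objects driven here, a toy stand-in that satisfies the calls made above (allocate_vocab_mask, fill_vocab_mask, move_vocab_mask, apply_vocab_mask, and the finished flag). The signatures are inferred from these call sites; the real outlines/xgrammar backends differ in detail.

import torch

class ToyGrammar:
    """Minimal grammar stub exposing the vocab-mask protocol used above."""

    def __init__(self, allowed_token_ids, finished=False):
        self.allowed = allowed_token_ids
        self.finished = finished

    def allocate_vocab_mask(self, vocab_size, batch_size, device):
        # All False: rows without a grammar stay unmasked.
        return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)

    def fill_vocab_mask(self, vocab_mask, i):
        vocab_mask[i] = True                 # forbid everything for row i ...
        vocab_mask[i, self.allowed] = False  # ... except the grammar-legal tokens

    def move_vocab_mask(self, vocab_mask, device):
        return vocab_mask.to(device, non_blocking=True)

    @staticmethod
    def apply_vocab_mask(logits, vocab_mask):
        logits.masked_fill_(vocab_mask, float("-inf"))

# Driving it the same way as the loop above:
grammars = [ToyGrammar([1, 5, 9]), None]
first_grammar = next(g for g in grammars if g)
mask = first_grammar.allocate_vocab_mask(vocab_size=16, batch_size=2, device="cpu")
for i, g in enumerate(grammars):
    if g and not g.finished:
        g.fill_vocab_mask(mask, i)
mask = first_grammar.move_vocab_mask(mask, "cpu")
logits = torch.zeros(2, 16)
first_grammar.apply_vocab_mask(logits, mask)  # row 0 keeps only tokens 1, 5, 9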
sglang/srt/server.py CHANGED
@@ -196,7 +196,7 @@ async def stop_profile_async():
  @app.post("/update_weights_from_disk")
  @time_func_latency
  async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
- """Update the weights from disk inplace without re-launching the server."""
+ """Update the weights from disk in-place without re-launching the server."""
  success, message = await tokenizer_manager.update_weights_from_disk(obj, request)
  content = {"success": success, "message": message}
  if success:
@@ -329,7 +329,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
  )


- @app.api_route("/encode", methods=["POST", "PUT"])
+ @app.api_route("/classify", methods=["POST", "PUT"])
  @time_func_latency
  async def classify_request(obj: EmbeddingReqInput, request: Request):
  """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
@@ -462,8 +462,8 @@ def launch_engine(
  if server_args.node_rank >= 1:
  # For other nodes, they do not need to run tokenizer or detokenizer,
  # so they can just wait here.
- while True:
- pass
+ for proc in scheduler_procs:
+ proc.join()
  else:
  # Launch the data parallel controller
  reader, writer = mp.Pipe(duplex=False)
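The launch_engine change above swaps a busy-wait for blocking joins on the scheduler processes. A self-contained sketch of that pattern (not sglang code): join() sleeps until each worker exits instead of burning a CPU core in while True: pass.

import multiprocessing as mp
import time

def scheduler_worker(rank: int):
    time.sleep(1.0)  # stand-in for the scheduler event loop

if __name__ == "__main__":
    procs = [mp.Process(target=scheduler_worker, args=(r,)) for r in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()  # blocks without spinning; returns when the worker exits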
sglang/srt/server_args.py CHANGED
@@ -20,9 +20,12 @@ import random
  import tempfile
  from typing import List, Optional

+ import torch
+
  from sglang.srt.hf_transformers_utils import check_gguf_file
  from sglang.srt.utils import (
  get_amdgpu_memory_capacity,
+ get_hpu_memory_capacity,
  get_nvgpu_memory_capacity,
  is_flashinfer_available,
  is_hip,
@@ -91,6 +94,8 @@ class ServerArgs:
  # Data parallelism
  dp_size: int = 1
  load_balance_method: str = "round_robin"
+ # Expert parallelism
+ ep_size: int = 1

  # Multi-node distributed serving
  dist_init_addr: Optional[str] = None
@@ -128,6 +133,7 @@ class ServerArgs:
  disable_overlap_schedule: bool = False
  enable_mixed_chunk: bool = False
  enable_dp_attention: bool = False
+ enable_ep_moe: bool = False
  enable_torch_compile: bool = False
  torch_compile_max_bs: int = 32
  cuda_graph_max_bs: Optional[int] = None
@@ -135,6 +141,7 @@ class ServerArgs:
  enable_nan_detection: bool = False
  enable_p2p_check: bool = False
  triton_attention_reduce_in_fp32: bool = False
+ triton_attention_num_kv_splits: int = 8
  num_continuous_decode_steps: int = 1
  delete_ckpt_after_loading: bool = False

@@ -151,8 +158,13 @@ class ServerArgs:

  if is_hip():
  gpu_mem = get_amdgpu_memory_capacity()
- else:
+ elif torch.cuda.is_available():
  gpu_mem = get_nvgpu_memory_capacity()
+ elif self.device == "hpu":
+ gpu_mem = get_hpu_memory_capacity()
+ else:
+ # GPU memory is not known yet or no GPU is available.
+ gpu_mem = None

  # Set mem fraction static, which depends on the tensor parallelism size
  if self.mem_fraction_static is None:
@@ -169,19 +181,27 @@ class ServerArgs:

  # Set chunked prefill size, which depends on the gpu memory capacity
  if self.chunked_prefill_size is None:
- if gpu_mem < 25_000:
+ if gpu_mem is not None and gpu_mem < 25_000:
  self.chunked_prefill_size = 2048
  else:
  self.chunked_prefill_size = 8192

  # Set cuda graph max batch size
  if self.cuda_graph_max_bs is None:
- if gpu_mem < 25_000:
- self.cuda_graph_max_bs = 8
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+ if gpu_mem is not None and gpu_mem < 25_000:
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 8
+ else:
+ self.cuda_graph_max_bs = 80
  else:
  self.cuda_graph_max_bs = 160

  # Choose kernel backends
+ if self.device == "hpu":
+ self.attention_backend = "torch_native"
+ self.sampling_backend = "pytorch"
+
  if self.attention_backend is None:
  self.attention_backend = (
  "flashinfer" if is_flashinfer_available() else "triton"
@@ -201,16 +221,20 @@ class ServerArgs:
  if self.enable_dp_attention:
  self.dp_size = self.tp_size
  self.chunked_prefill_size = self.chunked_prefill_size // 2
- self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
  self.schedule_conservativeness = self.schedule_conservativeness * 0.3
  self.disable_overlap_schedule = True
  logger.warning(
  f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
- f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
  f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
  "Data parallel size is adjusted to be the same as tensor parallel size. "
  "Overlap scheduler is disabled."
  )
+ # Expert parallelism
+ if self.enable_ep_moe:
+ self.ep_size = self.tp_size
+ logger.info(
+ f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+ )

  # GGUF
  if (
@@ -257,7 +281,15 @@ class ServerArgs:
  "--load-format",
  type=str,
  default=ServerArgs.load_format,
- choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
+ choices=[
+ "auto",
+ "pt",
+ "safetensors",
+ "npcache",
+ "dummy",
+ "gguf",
+ "bitsandbytes",
+ ],
  help="The format of the model weights to load. "
  '"auto" will try to load the weights in the safetensors format '
  "and fall back to the pytorch bin format if safetensors format "
@@ -268,7 +300,9 @@ class ServerArgs:
  "a numpy cache to speed up the loading. "
  '"dummy" will initialize the weights with random values, '
  "which is mainly for profiling."
- '"gguf" will load the weights in the gguf format. ',
+ '"gguf" will load the weights in the gguf format. '
+ '"bitsandbytes" will load the weights using bitsandbytes '
+ "quantization.",
  )
  parser.add_argument(
  "--trust-remote-code",
@@ -521,6 +555,14 @@ class ServerArgs:
  "shortest_queue",
  ],
  )
+ # Expert parallelism
+ parser.add_argument(
+ "--expert-parallel-size",
+ "--ep-size",
+ type=int,
+ default=ServerArgs.ep_size,
+ help="The expert parallelism size.",
+ )

  # Multi-node distributed serving
  parser.add_argument(
@@ -656,11 +698,6 @@ class ServerArgs:
  action="store_true",
  help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
  )
- parser.add_argument(
- "--disable-nan-detection",
- action="store_true",
- help="Disable the NaN detection for better performance.",
- )
  parser.add_argument(
  "--disable-overlap-schedule",
  action="store_true",
@@ -676,6 +713,11 @@ class ServerArgs:
  action="store_true",
  help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
  )
+ parser.add_argument(
+ "--enable-ep-moe",
+ action="store_true",
+ help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+ )
  parser.add_argument(
  "--enable-torch-compile",
  action="store_true",
@@ -715,6 +757,12 @@ class ServerArgs:
  help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
  "This only affects Triton attention kernels.",
  )
+ parser.add_argument(
+ "--triton-attention-num-kv-splits",
+ type=int,
+ default=ServerArgs.triton_attention_num_kv_splits,
+ help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
+ )
  parser.add_argument(
  "--num-continuous-decode-steps",
  type=int,
@@ -755,6 +803,7 @@ class ServerArgs:
  def from_cli_args(cls, args: argparse.Namespace):
  args.tp_size = args.tensor_parallel_size
  args.dp_size = args.data_parallel_size
+ args.ep_size = args.expert_parallel_size
  attrs = [attr.name for attr in dataclasses.fields(cls)]
  return cls(**{attr: getattr(args, attr) for attr in attrs})
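To exercise the new flags end to end, a hedged sketch of parsing them into a ServerArgs. It assumes the existing ServerArgs.add_cli_args helper and the usual --model-path/--tp-size flags, none of which are shown in this diff; the model name is a placeholder.

import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)  # assumed helper that registers the arguments above
args = parser.parse_args(
    [
        "--model-path", "Qwen/Qwen2-57B-A14B-Instruct",
        "--tp-size", "4",
        "--enable-ep-moe",                         # new in 0.4.0.post2
        "--triton-attention-num-kv-splits", "16",  # new tuning knob, default 8
    ]
)
server_args = ServerArgs.from_cli_args(args)  # maps expert_parallel_size -> ep_size
print(server_args.ep_size)  # expected to follow tp_size once EP MoE is enabled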
sglang/srt/utils.py CHANGED
@@ -92,7 +92,7 @@ def is_flashinfer_available():
  """
  if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
  return False
- return torch.cuda.is_available() and not is_hip()
+ return torch.cuda.is_available() and torch.version.cuda


  def is_ipv6(address):
@@ -169,7 +169,7 @@ def calculate_time(show=False, min_cost_ms=0.0):
  return wrapper


- def get_available_gpu_memory(device, gpu_id, distributed=False):
+ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):
  """
  Get available memory for cuda:gpu_id device.
  When distributed is True, the available memory is the minimum available memory of all GPUs.
@@ -184,7 +184,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
  "which may cause useless memory allocation for torch CUDA context.",
  )

- torch.cuda.empty_cache()
+ if empty_cache:
+ torch.cuda.empty_cache()
  free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

  elif device == "xpu":
@@ -196,11 +197,25 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
  f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
  "which may cause useless memory allocation for torch XPU context.",
  )
- torch.xpu.empty_cache()
+
+ if empty_cache:
+ torch.xpu.empty_cache()
  used_memory = torch.xpu.memory_allocated()
  total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
  free_gpu_memory = total_gpu_memory - used_memory

+ elif device == "hpu":
+ num_gpus = torch.hpu.device_count()
+ assert gpu_id < num_gpus
+
+ if torch.hpu.current_device() != gpu_id:
+ print(
+ f"WARNING: current device is not {gpu_id}, but {torch.hpu.current_device()}, ",
+ "which may cause useless memory allocation for torch HPU context.",
+ )
+
+ free_gpu_memory, total_gpu_memory = torch.hpu.mem_get_info()
+
  if distributed:
  tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
  torch.device(device, gpu_id)
@@ -939,6 +954,37 @@ def get_nvgpu_memory_capacity():
  )


+ def get_hpu_memory_capacity():
+ try:
+ # Run hl-smi and capture the output
+ result = subprocess.run(
+ ["hl-smi --query | grep 'Total'"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=True,
+ text=True,
+ )
+
+ if result.returncode != 0:
+ raise RuntimeError(f"hl-smi error: {result.stderr.strip()}")
+
+ # Parse the output to extract memory values in MiB
+ memory_values = [
+ float(mem.split(" ")[-2]) for mem in result.stdout.strip().split("\n")
+ ]
+
+ if not memory_values:
+ raise ValueError("No GPU memory values found.")
+
+ # Return the minimum memory value
+ return min(memory_values)
+
+ except FileNotFoundError:
+ raise RuntimeError(
+ "hl-smi not found. Ensure Habana drivers are installed and accessible."
+ )
+
+
  # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
  # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
  # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
@@ -1025,9 +1071,6 @@ def get_device_name(device_id: int = 0) -> str:
  if hasattr(torch, "cuda") and torch.cuda.is_available():
  return torch.cuda.get_device_name(device_id)

- if hasattr(torch, "hip") and torch.hip.is_available():
- return torch.hip.get_device_name(device_id)
-
  if hasattr(torch, "xpu") and torch.xpu.is_available():
  return torch.xpu.get_device_name(device_id)

@@ -1040,9 +1083,6 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
  if hasattr(torch, "cuda") and torch.cuda.is_available():
  major, minor = torch.cuda.get_device_capability(device_id)

- if hasattr(torch, "hip") and torch.hip.is_available():
- major, minor = torch.cuda.get_device_capability(device_id)
-
  if hasattr(torch, "xpu") and torch.xpu.is_available():
  major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
  "."
@@ -1062,6 +1102,13 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
  return major, minor


+ def get_compiler_backend() -> str:
+ if hasattr(torch, "hpu") and torch.hpu.is_available():
+ return "hpu_backend"
+
+ return "inductor"
+
+
  sglang_lib = Library("sglang", "FRAGMENT") # noqa
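The new get_compiler_backend() helper returns "hpu_backend" on Gaudi and "inductor" elsewhere, so torch.compile call sites can stay device-agnostic. A minimal sketch of such a call site; the wrapper function is illustrative, not taken from the diff.

import torch

from sglang.srt.utils import get_compiler_backend

def maybe_compile(module: torch.nn.Module) -> torch.nn.Module:
    # The same module code compiles on CUDA (inductor) or HPU (hpu_backend).
    return torch.compile(module, backend=get_compiler_backend())

compiled = maybe_compile(torch.nn.Linear(16, 16))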
sglang/test/test_utils.py CHANGED
@@ -568,6 +568,7 @@ def run_bench_serving(
  disable_tqdm=False,
  disable_stream=disable_stream,
  disable_ignore_eos=False,
+ return_logprob=False,
  lora_name=None,
  extra_request_body=None,
  profile=None,
@@ -719,13 +720,13 @@ def run_and_check_memory_leak(

  # Clean up everything
  kill_process_tree(process.pid)
- kill_process_tree(process.pid)
  stdout.close()
  stderr.close()
  if os.path.exists(STDOUT_FILENAME):
  os.remove(STDOUT_FILENAME)
  if os.path.exists(STDERR_FILENAME):
  os.remove(STDERR_FILENAME)
+ kill_process_tree(process.pid)
  t.join()

  # Assert success
@@ -733,7 +734,7 @@ def run_and_check_memory_leak(
  has_leak = False
  has_abort = False
  for line in output_lines:
- if "The server is fired" in line:
+ if "Uvicorn running" in line:
  has_new_server = True
  if "leak" in line:
  has_leak = True
sglang/utils.py CHANGED
@@ -1,4 +1,4 @@
- """Common utilities."""
+ """Common utilities"""

  import base64
  import gc
@@ -79,7 +79,14 @@ class HttpResponse:
  return self.resp.status


- def http_request(url, json=None, stream=False, api_key=None, verify=None):
+ def http_request(
+ url,
+ json=None,
+ stream=False,
+ api_key=None,
+ verify=None,
+ method: Optional[str] = None,
+ ):
  """A faster version of requests.post with low-level urllib API."""
  headers = {"Content-Type": "application/json; charset=utf-8"}

@@ -90,7 +97,7 @@ def http_request(url, json=None, stream=False, api_key=None, verify=None):
  if stream:
  return requests.post(url, json=json, stream=True, headers=headers)
  else:
- req = urllib.request.Request(url, headers=headers)
+ req = urllib.request.Request(url, headers=headers, method=method)
  if json is None:
  data = None
  else:
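The new method keyword lets http_request issue non-POST requests through its urllib path. A hedged usage sketch; the port, the /get_model_info endpoint, and the assumption that the returned HttpResponse exposes a json() helper are not shown in this diff.

from sglang.utils import http_request

# Query a small status endpoint with GET instead of the default POST behavior.
resp = http_request("http://localhost:30000/get_model_info", method="GET")
print(resp.json())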
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.0"
+ __version__ = "0.4.0.post2"
{sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.0
+ Version: 0.4.0.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -215,6 +215,7 @@ Requires-Dist: requests
  Requires-Dist: tqdm
  Requires-Dist: numpy
  Requires-Dist: IPython
+ Requires-Dist: setproctitle
  Provides-Extra: runtime-common
  Requires-Dist: aiohttp; extra == "runtime-common"
  Requires-Dist: decord; extra == "runtime-common"
@@ -232,16 +233,17 @@ Requires-Dist: psutil; extra == "runtime-common"
  Requires-Dist: pydantic; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
- Requires-Dist: torchao; extra == "runtime-common"
+ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+ Requires-Dist: gemlite; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
+ Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: torch; extra == "srt"
- Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
+ Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: flashinfer>=0.1.6; extra == "srt"
+ Requires-Dist: flashinfer==0.1.6; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -311,10 +313,14 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+ | [**Documentation**](https://sgl-project.github.io/)
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+ | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
+ | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
+ - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
  - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -346,13 +352,13 @@ The core features include:
  - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)

  ## Benchmark And Performance
- Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)

  ## Roadmap
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).