sglang 0.4.2.post4__py3-none-any.whl → 0.4.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. sglang/global_config.py +2 -0
  2. sglang/lang/backend/openai.py +5 -0
  3. sglang/lang/chat_template.py +22 -7
  4. sglang/lang/ir.py +1 -0
  5. sglang/srt/configs/__init__.py +6 -3
  6. sglang/srt/configs/model_config.py +2 -0
  7. sglang/srt/configs/qwen2_5_vl_config.py +1003 -0
  8. sglang/srt/entrypoints/engine.py +18 -3
  9. sglang/srt/hf_transformers_utils.py +2 -3
  10. sglang/srt/layers/attention/flashinfer_backend.py +235 -110
  11. sglang/srt/layers/attention/triton_backend.py +358 -72
  12. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  13. sglang/srt/layers/linear.py +12 -5
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +178 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +175 -0
  23. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -2
  24. sglang/srt/layers/moe/fused_moe_triton/layer.py +2 -0
  25. sglang/srt/layers/moe/topk.py +1 -1
  26. sglang/srt/layers/quantization/__init__.py +51 -5
  27. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
  29. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +29 -29
  31. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +33 -33
  33. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
  35. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +27 -27
  37. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  38. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
  39. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +24 -24
  41. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
  43. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +42 -42
  45. sglang/srt/layers/quantization/fp8_kernel.py +123 -17
  46. sglang/srt/layers/quantization/fp8_utils.py +33 -4
  47. sglang/srt/managers/detokenizer_manager.py +1 -0
  48. sglang/srt/managers/image_processor.py +217 -122
  49. sglang/srt/managers/io_struct.py +4 -0
  50. sglang/srt/managers/schedule_batch.py +16 -3
  51. sglang/srt/managers/scheduler.py +29 -0
  52. sglang/srt/managers/tokenizer_manager.py +6 -0
  53. sglang/srt/managers/tp_worker_overlap_thread.py +4 -0
  54. sglang/srt/model_executor/cuda_graph_runner.py +12 -1
  55. sglang/srt/model_executor/forward_batch_info.py +4 -1
  56. sglang/srt/model_executor/model_runner.py +12 -2
  57. sglang/srt/models/deepseek_nextn.py +295 -0
  58. sglang/srt/models/deepseek_v2.py +21 -8
  59. sglang/srt/models/llava.py +2 -1
  60. sglang/srt/models/qwen2_5_vl.py +722 -0
  61. sglang/srt/models/qwen2_vl.py +2 -1
  62. sglang/srt/openai_api/adapter.py +17 -3
  63. sglang/srt/server_args.py +26 -4
  64. sglang/srt/speculative/eagle_worker.py +35 -10
  65. sglang/srt/speculative/spec_info.py +11 -1
  66. sglang/srt/utils.py +7 -0
  67. sglang/utils.py +99 -19
  68. sglang/version.py +1 -1
  69. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/METADATA +5 -4
  70. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/RECORD +73 -55
  71. sglang/srt/configs/qwen2vl.py +0 -130
  72. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/LICENSE +0 -0
  73. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_vl.py CHANGED
@@ -31,8 +31,9 @@ import torch
  import torch.nn as nn
  import torch.nn.functional as F
  from einops import rearrange
+ from transformers import Qwen2VLConfig
+ from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig

- from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
  from sglang.srt.hf_transformers_utils import get_processor
  from sglang.srt.layers.activation import QuickGELU
  from sglang.srt.layers.attention.vision import VisionAttention
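With sglang/srt/configs/qwen2vl.py removed (see the file list above), Qwen2VLConfig and Qwen2VLVisionConfig now come from Hugging Face transformers. A minimal sketch of the new import path, assuming a transformers version recent enough to ship the Qwen2-VL configuration classes:

    # Sketch only: the upstream transformers classes replace the sglang-local copies.
    from transformers import Qwen2VLConfig
    from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig

    config = Qwen2VLConfig()        # builds a default vision_config internally
    vision = Qwen2VLVisionConfig()  # standalone vision-tower configuration
    print(type(config.vision_config).__name__, vision.depth)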
sglang/srt/openai_api/adapter.py CHANGED
@@ -20,12 +20,14 @@ import os
  import time
  import uuid
  from http import HTTPStatus
- from typing import Dict, List, Optional
+ from typing import Dict, List

  from fastapi import HTTPException, Request, UploadFile
  from fastapi.responses import ORJSONResponse, StreamingResponse
  from pydantic import ValidationError

+ from sglang.lang.chat_template import get_chat_template_by_model_path
+
  try:
      from outlines.fsm.json_schema import convert_json_schema_to_str
  except ImportError:
@@ -92,7 +94,6 @@ file_id_response: Dict[str, FileResponse] = {}
  # map file id to file path in SGLang backend
  file_id_storage: Dict[str, str] = {}

-
  # backend storage directory
  storage_dir = None

@@ -116,12 +117,13 @@ def create_streaming_error_response(
      return json_str


- def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
+ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, model_path):
      global chat_template_name

      logger.info(
          f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
      )
+
      if not chat_template_exists(chat_template_arg):
          if not os.path.exists(chat_template_arg):
              raise RuntimeError(
@@ -163,6 +165,18 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
      else:
          chat_template_name = chat_template_arg

+     # check chat-template
+     chat_template = get_chat_template_by_model_path(model_path)
+     if chat_template is not None:
+         official_chat_template = chat_template.name
+         used_chat_template = chat_template_name
+         if official_chat_template != used_chat_template:
+             logger.warning(
+                 f"Using a chat_template: '{used_chat_template}', "
+                 f"which is different from official chat template: '{official_chat_template}', "
+                 f"This discrepancy may lead to performance degradation."
+             )
+

  async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
      try:
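The new tail of load_chat_template_for_openai_api compares the user-supplied template name against the one sglang registers for the model path and logs a warning on mismatch. A small sketch of the lookup it relies on; the model path is illustrative:

    from sglang.lang.chat_template import get_chat_template_by_model_path

    # Returns the ChatTemplate registered for a matching model-path pattern;
    # the adapter compares its .name against the user-supplied template name.
    tmpl = get_chat_template_by_model_path("Qwen/Qwen2.5-7B-Instruct")
    if tmpl is not None:
        print(tmpl.name)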
sglang/srt/server_args.py CHANGED
@@ -140,6 +140,7 @@ class ServerArgs:
      disable_jump_forward: bool = False
      disable_cuda_graph: bool = False
      disable_cuda_graph_padding: bool = False
+     enable_nccl_nvls: bool = False
      disable_outlines_disk_cache: bool = False
      disable_custom_all_reduce: bool = False
      disable_mla: bool = False
@@ -160,12 +161,15 @@
      delete_ckpt_after_loading: bool = False
      enable_memory_saver: bool = False
      allow_auto_truncate: bool = False
+     return_hidden_states: bool = False

      # Custom logit processor
      enable_custom_logit_processor: bool = False
      tool_call_parser: str = None
      enable_hierarchical_cache: bool = False

+     enable_flashinfer_mla: bool = False
+
      def __post_init__(self):
          # Set missing default values
          if self.tokenizer_path is None:
@@ -258,14 +262,17 @@
          )

          # Speculative Decoding
-         if self.speculative_algorithm == "EAGLE":
+         if (
+             self.speculative_algorithm == "EAGLE"
+             or self.speculative_algorithm == "NEXTN"
+         ):
              self.prefill_only_one_req = True
              self.disable_cuda_graph_padding = True
              self.disable_radix_cache = True
              self.disable_overlap_schedule = True
              self.chunked_prefill_size = -1
              logger.info(
-                 "The radix cache, chunked prefill, and overlap scheduler are disabled because of using eagle speculative decoding."
+                 f"The radix cache, chunked prefill, and overlap scheduler are disabled because of using {self.speculative_algorithm} speculative decoding."
              )

          # GGUF
@@ -691,12 +698,17 @@
          default=ServerArgs.grammar_backend,
          help="Choose the backend for grammar-guided decoding.",
      )
+     parser.add_argument(
+         "--enable-flashinfer-mla",
+         action="store_true",
+         help="Enable FlashInfer MLA optimization",
+     )

      # Speculative decoding
      parser.add_argument(
          "--speculative-algorithm",
          type=str,
-         choices=["EAGLE"],
+         choices=["EAGLE", "NEXTN"],
          help="Speculative algorithm.",
      )
      parser.add_argument(
@@ -782,6 +794,11 @@
          action="store_true",
          help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
      )
+     parser.add_argument(
+         "--enable-nccl-nvls",
+         action="store_true",
+         help="Enable NCCL NVLS for prefill heavy requests when available.",
+     )
      parser.add_argument(
          "--disable-outlines-disk-cache",
          action="store_true",
@@ -795,7 +812,7 @@
      parser.add_argument(
          "--disable-mla",
          action="store_true",
-         help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+         help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
      )
      parser.add_argument(
          "--disable-overlap-schedule",
@@ -896,6 +913,11 @@
          action="store_true",
          help="Enable users to pass custom logit processors to the server (disabled by default for security)",
      )
+     parser.add_argument(
+         "--return-hidden-states",
+         action="store_true",
+         help="Return hidden states in the response.",
+     )
      # Function Calling
      parser.add_argument(
          "--tool-call-parser",
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -24,6 +24,7 @@ from sglang.srt.speculative.eagle_utils import (
      fast_topk,
      select_top_k_tokens,
  )
+ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

  logger = logging.getLogger(__name__)

@@ -57,23 +58,43 @@ class EAGLEWorker(TpModelWorker):
          # Parse arguments
          self.topk = server_args.speculative_eagle_topk
          self.speculative_num_steps = server_args.speculative_num_steps
+         self.speculative_algorithm = SpeculativeAlgorithm.from_string(
+             server_args.speculative_algorithm
+         )
          self.server_args = server_args

          # Share the embedding and lm_head
-         embed, head = self.target_worker.model_runner.model.get_embed_and_head()
-         self.model_runner.model.set_embed_and_head(embed, head)
+         if not self.speculative_algorithm.is_nextn():
+             embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+             self.model_runner.model.set_embed_and_head(embed, head)
          self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph

          # Create multi-step attn backends and cuda graph runners
-         from sglang.srt.layers.attention.flashinfer_backend import (
-             FlashInferMultiStepDraftBackend,
-         )
+         if server_args.attention_backend == "flashinfer":
+             from sglang.srt.layers.attention.flashinfer_backend import (
+                 FlashInferMultiStepDraftBackend,
+             )
+
+             self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+                 self.model_runner,
+                 self.topk,
+                 self.speculative_num_steps,
+             )
+         elif server_args.attention_backend == "triton":
+             from sglang.srt.layers.attention.triton_backend import (
+                 TritonMultiStepDraftBackend,
+             )
+
+             self.draft_attn_backend = TritonMultiStepDraftBackend(
+                 self.model_runner,
+                 self.topk,
+                 self.speculative_num_steps,
+             )
+         else:
+             raise ValueError(
+                 f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
+             )

-         self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-             self.model_runner,
-             self.topk,
-             self.speculative_num_steps,
-         )
          self.model_runner.draft_attn_backend = self.draft_attn_backend
          self.init_cuda_graphs()

@@ -218,6 +239,10 @@
              token_list.append(tree_info[1])
              parents_list.append(tree_info[2])

+             # we don't need to run the last forward. we get 1 token from draft prefill and (#spec steps - 1) tokens here
+             if i == self.speculative_num_steps - 1:
+                 break
+
              # Set inputs
              forward_batch.input_ids = input_ids
              forward_batch.out_cache_loc = out_cache_loc[
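The early break is bookkeeping rather than a behavior change: as the added comment says, the draft prefill already contributed one token, so the loop needs only speculative_num_steps - 1 forwards. Worked out in Python:

    # Token accounting behind the early break (e.g. speculative_num_steps = 4):
    #   draft prefill        -> 1 draft token
    #   loop steps i = 0..2  -> 3 more tokens, one per forward
    # Running the i == 3 forward would draft a token the verifier never consumes.
    speculative_num_steps = 4
    assert 1 + (speculative_num_steps - 1) == speculative_num_steps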
sglang/srt/speculative/spec_info.py CHANGED
@@ -5,18 +5,28 @@ class SpeculativeAlgorithm(IntEnum):
      NONE = auto()
      EAGLE = auto()

+     # NEXTN spec decoding is for DeepSeek V3/R1
+     # currently it's implemented based on EAGLE
+     NEXTN = auto()
+
      def is_none(self):
          return self == SpeculativeAlgorithm.NONE

      def is_eagle(self):
-         return self == SpeculativeAlgorithm.EAGLE
+         return self == SpeculativeAlgorithm.EAGLE or self == SpeculativeAlgorithm.NEXTN
+
+     def is_nextn(self):
+         return self == SpeculativeAlgorithm.NEXTN

      @staticmethod
      def from_string(name: str):
          name_map = {
              "EAGLE": SpeculativeAlgorithm.EAGLE,
+             "NEXTN": SpeculativeAlgorithm.NEXTN,
              None: SpeculativeAlgorithm.NONE,
          }
+         if name is not None:
+             name = name.upper()
          return name_map[name]

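Note that from_string now upper-cases the name before the lookup, so lowercase CLI values resolve as well. A quick sketch of the resulting behavior:

    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

    algo = SpeculativeAlgorithm.from_string("nextn")  # upper-cased before lookup
    assert algo is SpeculativeAlgorithm.NEXTN
    assert algo.is_eagle() and algo.is_nextn()  # NEXTN reuses the EAGLE path
    assert SpeculativeAlgorithm.from_string(None).is_none()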
sglang/srt/utils.py CHANGED
@@ -1444,3 +1444,10 @@ def launch_dummy_health_check_server(host, port):
          timeout_keep_alive=5,
          loop="uvloop",
      )
+
+
+ def set_cuda_arch():
+     if is_flashinfer_available():
+         capability = torch.cuda.get_device_capability()
+         arch = f"{capability[0]}.{capability[1]}"
+         os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
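set_cuda_arch pins TORCH_CUDA_ARCH_LIST to the compute capability of the current device, presumably so that JIT-compiled kernels target only that arch, and appends +PTX on 9.0 (Hopper). The mapping, worked through for two common capabilities:

    # (9, 0) e.g. H100 -> arch "9.0" -> TORCH_CUDA_ARCH_LIST="9.0+PTX"
    # (8, 0) e.g. A100 -> arch "8.0" -> TORCH_CUDA_ARCH_LIST="8.0"
    for capability in [(9, 0), (8, 0)]:  # illustrative; the real values come from torch.cuda
        arch = f"{capability[0]}.{capability[1]}"
        print(f"{arch}{'+PTX' if arch == '9.0' else ''}")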
sglang/utils.py CHANGED
@@ -306,22 +306,112 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
      return filename


- def execute_shell_command(command: str) -> subprocess.Popen:
+ import fcntl
+
+
+ def is_in_ci():
+     from sglang.test.test_utils import is_in_ci
+
+     return is_in_ci()
+
+
+ LOCKFILE = os.path.expanduser("~/.sglang_port_lock")
+ PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json")
+
+ if not os.path.exists(LOCKFILE):
+     with open(LOCKFILE, "w") as f:
+         pass
+
+ if not os.path.exists(PORT_REGISTRY):
+     with open(PORT_REGISTRY, "w") as f:
+         json.dump([], f)
+
+
+ def print_highlight(html_content: str):
+     if is_in_ci():
+         html_content = str(html_content).replace("\n", "<br>")
+         display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
+     else:
+         print(html_content)
+
+
+ def init_port_registry():
+     """Initialize the port registry file if it doesn't exist."""
+     if not os.path.exists(PORT_REGISTRY):
+         with open(PORT_REGISTRY, "w") as f:
+             json.dump([], f)
+
+
+ def reserve_port(start=30000, end=40000):
+     """
+     Reserve an available port using a file lock and a registry.
+     Returns the allocated port.
      """
-     Execute a shell command and return the process handle
+     init_port_registry()
+     with open(LOCKFILE, "w") as lock:
+         fcntl.flock(lock, fcntl.LOCK_EX)
+         try:
+             with open(PORT_REGISTRY, "r") as f:
+                 used = json.load(f)
+         except Exception:
+             used = []
+         for port in range(start, end):
+             if port not in used:
+                 used.append(port)
+                 with open(PORT_REGISTRY, "w") as f:
+                     json.dump(used, f)
+                 return port
+         raise RuntimeError("No free port available")
+
+
+ def release_port(port):
+     """Release the reserved port by removing it from the registry."""
+     with open(LOCKFILE, "w") as lock:
+         fcntl.flock(lock, fcntl.LOCK_EX)
+         try:
+             with open(PORT_REGISTRY, "r") as f:
+                 used = json.load(f)
+         except Exception:
+             used = []
+         if port in used:
+             used.remove(port)
+             with open(PORT_REGISTRY, "w") as f:
+                 json.dump(used, f)

-     Args:
-         command: Shell command as a string (can include \\ line continuations)
-     Returns:
-         subprocess.Popen: Process handle
+
+ def execute_shell_command(command: str) -> subprocess.Popen:
      """
-     # Replace \ newline with space and split
+     Execute a shell command and return its process handle.
+     """
+     # Replace newline continuations and split the command string.
      command = command.replace("\\\n", " ").replace("\\", " ")
      parts = command.split()
-
      return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)


+ def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
+     """
+     Launch the server using the given command.
+     If no port is specified, a free port is reserved.
+     """
+     if port is None:
+         port = reserve_port()
+     full_command = f"{command} --port {port}"
+     process = execute_shell_command(full_command)
+     return process, port
+
+
+ def terminate_process(process, port=None):
+     """
+     Terminate the process and, if a port was reserved, release it.
+     """
+     from sglang.srt.utils import kill_process_tree
+
+     kill_process_tree(process.pid)
+     if port is not None:
+         release_port(port)
+
+
  def wait_for_server(base_url: str, timeout: int = None) -> None:
      """Wait for the server to be ready by polling the /v1/models endpoint.
@@ -343,6 +433,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
      NOTE: Typically, the server runs in a separate terminal.
      In this notebook, we run the server and notebook code together, so their outputs are combined.
      To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+     We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
      """
      )
      break
@@ -353,17 +444,6 @@
      time.sleep(1)


- def terminate_process(process):
-     from sglang.srt.utils import kill_process_tree
-
-     kill_process_tree(process.pid)
-
-
- def print_highlight(html_content: str):
-     html_content = str(html_content).replace("\n", "<br>")
-     display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
-
-
  class TypeBasedDispatcher:
      def __init__(self, mapping: List[Tuple[Type, Callable]]):
          self._mapping = mapping
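These helpers let the documentation notebooks run several servers in parallel CI jobs without port collisions: reserve_port hands out a free port from a file-locked JSON registry, launch_server_cmd appends it to the command as --port, and terminate_process gives it back. A usage sketch built from the functions added above; the model path is illustrative:

    from sglang.utils import launch_server_cmd, terminate_process, wait_for_server

    process, port = launch_server_cmd(
        "python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct"
    )
    wait_for_server(f"http://localhost:{port}")
    # ... issue requests against the OpenAI-compatible endpoint ...
    terminate_process(process, port)  # kills the process tree and releases the port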
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.2.post4"
+ __version__ = "0.4.3.post1"
{sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: sglang
- Version: 0.4.2.post4
+ Version: 0.4.3.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                           Version 2.0, January 2004
@@ -235,14 +235,15 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
+ Requires-Dist: xgrammar==0.1.10; extra == "runtime-common"
+ Requires-Dist: ninja; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: sgl-kernel>=0.0.3.post3; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.3.post6; extra == "srt"
  Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
- Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
+ Requires-Dist: flashinfer_python>=0.2.1.post1; extra == "srt"
  Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"