sglang 0.4.1.post6__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +24 -16
  4. sglang/bench_one_batch.py +51 -3
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +37 -28
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +15 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/model_config.py +1 -0
  13. sglang/srt/constrained/base_grammar_backend.py +21 -0
  14. sglang/srt/constrained/xgrammar_backend.py +8 -4
  15. sglang/srt/conversation.py +14 -1
  16. sglang/srt/distributed/__init__.py +3 -3
  17. sglang/srt/distributed/communication_op.py +2 -1
  18. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  21. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  22. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  23. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  24. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  25. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  26. sglang/srt/distributed/parallel_state.py +1 -1
  27. sglang/srt/distributed/utils.py +2 -1
  28. sglang/srt/entrypoints/engine.py +449 -0
  29. sglang/srt/entrypoints/http_server.py +579 -0
  30. sglang/srt/layers/activation.py +3 -3
  31. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  32. sglang/srt/layers/attention/triton_backend.py +4 -6
  33. sglang/srt/layers/attention/vision.py +204 -0
  34. sglang/srt/layers/dp_attention.py +69 -0
  35. sglang/srt/layers/linear.py +41 -5
  36. sglang/srt/layers/logits_processor.py +48 -63
  37. sglang/srt/layers/moe/ep_moe/layer.py +4 -4
  38. sglang/srt/layers/moe/fused_moe_native.py +69 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
  41. sglang/srt/layers/parameter.py +2 -1
  42. sglang/srt/layers/quantization/__init__.py +20 -23
  43. sglang/srt/layers/quantization/fp8.py +6 -3
  44. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  45. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  46. sglang/srt/layers/radix_attention.py +2 -2
  47. sglang/srt/layers/rotary_embedding.py +1179 -31
  48. sglang/srt/layers/sampler.py +39 -1
  49. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  50. sglang/srt/lora/lora.py +1 -9
  51. sglang/srt/managers/configure_logging.py +3 -0
  52. sglang/srt/managers/data_parallel_controller.py +79 -72
  53. sglang/srt/managers/detokenizer_manager.py +23 -6
  54. sglang/srt/managers/image_processor.py +158 -2
  55. sglang/srt/managers/io_struct.py +25 -2
  56. sglang/srt/managers/schedule_batch.py +49 -22
  57. sglang/srt/managers/schedule_policy.py +26 -12
  58. sglang/srt/managers/scheduler.py +277 -178
  59. sglang/srt/managers/session_controller.py +1 -0
  60. sglang/srt/managers/tokenizer_manager.py +206 -121
  61. sglang/srt/managers/tp_worker.py +6 -4
  62. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  63. sglang/srt/managers/utils.py +44 -0
  64. sglang/srt/mem_cache/memory_pool.py +10 -32
  65. sglang/srt/metrics/collector.py +15 -6
  66. sglang/srt/model_executor/cuda_graph_runner.py +4 -6
  67. sglang/srt/model_executor/model_runner.py +37 -15
  68. sglang/srt/model_loader/loader.py +8 -6
  69. sglang/srt/model_loader/weight_utils.py +55 -2
  70. sglang/srt/models/baichuan.py +6 -6
  71. sglang/srt/models/chatglm.py +2 -2
  72. sglang/srt/models/commandr.py +3 -3
  73. sglang/srt/models/dbrx.py +4 -4
  74. sglang/srt/models/deepseek.py +3 -3
  75. sglang/srt/models/deepseek_v2.py +8 -8
  76. sglang/srt/models/exaone.py +2 -2
  77. sglang/srt/models/gemma.py +2 -2
  78. sglang/srt/models/gemma2.py +6 -24
  79. sglang/srt/models/gpt2.py +3 -5
  80. sglang/srt/models/gpt_bigcode.py +1 -1
  81. sglang/srt/models/granite.py +2 -2
  82. sglang/srt/models/grok.py +3 -3
  83. sglang/srt/models/internlm2.py +2 -2
  84. sglang/srt/models/llama.py +7 -5
  85. sglang/srt/models/minicpm.py +2 -2
  86. sglang/srt/models/minicpm3.py +6 -6
  87. sglang/srt/models/minicpmv.py +1238 -0
  88. sglang/srt/models/mixtral.py +3 -3
  89. sglang/srt/models/mixtral_quant.py +3 -3
  90. sglang/srt/models/mllama.py +2 -2
  91. sglang/srt/models/olmo.py +3 -3
  92. sglang/srt/models/olmo2.py +4 -4
  93. sglang/srt/models/olmoe.py +7 -13
  94. sglang/srt/models/phi3_small.py +2 -2
  95. sglang/srt/models/qwen.py +2 -2
  96. sglang/srt/models/qwen2.py +41 -4
  97. sglang/srt/models/qwen2_moe.py +3 -3
  98. sglang/srt/models/qwen2_vl.py +22 -122
  99. sglang/srt/models/stablelm.py +2 -2
  100. sglang/srt/models/torch_native_llama.py +3 -3
  101. sglang/srt/models/xverse.py +6 -6
  102. sglang/srt/models/xverse_moe.py +6 -6
  103. sglang/srt/openai_api/protocol.py +2 -0
  104. sglang/srt/sampling/custom_logit_processor.py +38 -0
  105. sglang/srt/sampling/sampling_batch_info.py +139 -4
  106. sglang/srt/sampling/sampling_params.py +3 -1
  107. sglang/srt/server.py +4 -1090
  108. sglang/srt/server_args.py +57 -14
  109. sglang/srt/utils.py +103 -65
  110. sglang/test/runners.py +8 -13
  111. sglang/test/test_programs.py +1 -1
  112. sglang/test/test_utils.py +3 -1
  113. sglang/utils.py +12 -2
  114. sglang/version.py +1 -1
  115. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +16 -5
  116. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +119 -115
  117. sglang/launch_server_llavavid.py +0 -25
  118. sglang/srt/constrained/__init__.py +0 -16
  119. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  120. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
  121. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -29,8 +29,8 @@ from sglang.srt.utils import (
     get_nvgpu_memory_capacity,
     is_flashinfer_available,
     is_hip,
-    is_ipv6,
     is_port_available,
+    is_valid_ipv6_address,
     nullable_str,
 )
 
@@ -157,6 +157,10 @@ class ServerArgs:
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    allow_auto_truncate: bool = False
+
+    # Custom logit processor
+    enable_custom_logit_processor: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -240,14 +244,13 @@ class ServerArgs:
         # Others
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
+            assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // 2
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            self.disable_overlap_schedule = True
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap scheduler is disabled."
             )
 
         # Speculative Decoding
@@ -392,7 +395,7 @@ class ServerArgs:
             "--device",
             type=str,
             default="cuda",
-            choices=["cuda", "xpu", "hpu"],
+            choices=["cuda", "xpu", "hpu", "cpu"],
             help="The device type.",
         )
         parser.add_argument(
@@ -860,6 +863,16 @@ class ServerArgs:
             action="store_true",
             help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
         )
+        parser.add_argument(
+            "--allow-auto-truncate",
+            action="store_true",
+            help="Allow automatically truncating requests that exceed the maximum input length instead of returning an error.",
+        )
+        parser.add_argument(
+            "--enable-custom-logit-processor",
+            action="store_true",
+            help="Enable users to pass custom logit processors to the server (disabled by default for security)",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -870,7 +883,7 @@ class ServerArgs:
         return cls(**{attr: getattr(args, attr) for attr in attrs})
 
     def url(self):
-        if is_ipv6(self.host):
+        if is_valid_ipv6_address(self.host):
             return f"http://[{self.host}]:{self.port}"
         else:
             return f"http://{self.host}:{self.port}"
@@ -880,8 +893,8 @@ class ServerArgs:
             self.tp_size % self.nnodes == 0
         ), "tp_size must be divisible by number of nodes"
         assert not (
-            self.dp_size > 1 and self.nnodes != 1
-        ), "multi-node data parallel is not supported"
+            self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
+        ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
@@ -919,6 +932,9 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
     return server_args
 
 
+ZMQ_TCP_PORT_DELTA = 233
+
+
 @dataclasses.dataclass
 class PortArgs:
     # The ipc filename for tokenizer to receive inputs from detokenizer (zmq)
@@ -932,7 +948,7 @@ class PortArgs:
     nccl_port: int
 
     @staticmethod
-    def init_new(server_args) -> "PortArgs":
+    def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
        port = server_args.port + random.randint(100, 1000)
        while True:
            if is_port_available(port):
@@ -942,12 +958,39 @@ class PortArgs:
             else:
                 port -= 43
 
-        return PortArgs(
-            tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
-            scheduler_input_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
-            detokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
-            nccl_port=port,
-        )
+        if not server_args.enable_dp_attention:
+            # Normal case, use IPC within a single node
+            return PortArgs(
+                tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                scheduler_input_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                detokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                nccl_port=port,
+            )
+        else:
+            # DP attention. Use TCP + port to handle both single-node and multi-node.
+            if server_args.nnodes == 1 and server_args.dist_init_addr is None:
+                dist_init_addr = ("127.0.0.1", server_args.port + ZMQ_TCP_PORT_DELTA)
+            else:
+                dist_init_addr = server_args.dist_init_addr.split(":")
+            assert (
+                len(dist_init_addr) == 2
+            ), "please provide --dist-init-addr as host:port of head node"
+
+            dist_init_host, dist_init_port = dist_init_addr
+            port_base = int(dist_init_port) + 1
+            if dp_rank is None:
+                scheduler_input_port = (
+                    port_base + 2
+                )  # TokenizerManager to DataParallelController
+            else:
+                scheduler_input_port = port_base + 2 + 1 + dp_rank
+
+            return PortArgs(
+                tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
+                scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
+                detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
+                nccl_port=port,
+            )
 
 
 class LoRAPathAction(argparse.Action):
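
Note: the `PortArgs.init_new` change above replaces per-process IPC files with a deterministic TCP layout when DP attention is enabled. The sketch below (not part of the diff) traces the endpoints it derives, assuming a hypothetical head node passed as `--dist-init-addr 10.0.0.1:5000`:

```python
# Sketch only: port layout derived by the new PortArgs.init_new under
# --enable-dp-attention, for a hypothetical head node 10.0.0.1:5000.
dist_init_host, dist_init_port = "10.0.0.1", 5000
port_base = dist_init_port + 1                                    # 5001

tokenizer_ipc_name = f"tcp://{dist_init_host}:{port_base}"        # tcp://10.0.0.1:5001
detokenizer_ipc_name = f"tcp://{dist_init_host}:{port_base + 1}"  # tcp://10.0.0.1:5002

# dp_rank is None: TokenizerManager -> DataParallelController channel
scheduler_input_port = port_base + 2                              # 5003

# dp_rank = k: per-rank scheduler input channel
def scheduler_input_port_for(dp_rank: int) -> int:
    return port_base + 2 + 1 + dp_rank                            # 5004 + k
```

When `--dist-init-addr` is omitted on a single node, the same layout is anchored at `127.0.0.1` and `server_args.port + ZMQ_TCP_PORT_DELTA` (233) instead.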
sglang/srt/utils.py CHANGED
@@ -59,6 +59,7 @@ from triton.runtime.cache import (
     default_dump_dir,
     default_override_dir,
 )
+from uvicorn.config import LOGGING_CONFIG
 
 logger = logging.getLogger(__name__)
 
@@ -101,14 +102,6 @@ def is_cuda_available():
     return torch.cuda.is_available() and torch.version.cuda
 
 
-def is_ipv6(address):
-    try:
-        ipaddress.IPv6Address(address)
-        return True
-    except ipaddress.AddressValueError:
-        return False
-
-
 def enable_show_time_cost():
     global show_time_cost
     show_time_cost = True
@@ -222,6 +215,10 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
 
         free_gpu_memory, total_gpu_memory = torch.hpu.mem_get_info()
 
+    elif device == "cpu":
+        # TODO: rename the variables in the current function to be not GPU specific
+        free_gpu_memory = psutil.virtual_memory().available
+
     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
             torch.device(device, gpu_id)
@@ -446,6 +443,8 @@ def load_image(image_file: Union[str, bytes]):
     else:
         raise ValueError(f"Invalid image: {image}")
 
+    # if image_size is None:
+    #     image_size = image.size
     return image, image_size
 
 
@@ -511,76 +510,32 @@ def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = N
         pass
 
 
-def monkey_patch_vllm_p2p_access_check(gpu_id: int):
+def monkey_patch_p2p_access_check():
     """
-    Monkey patch the slow p2p access check in vllm.
+    Monkey patch the slow p2p access check.
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """
 
-    import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
+    import sglang.srt.distributed.device_communicators.custom_all_reduce_utils as tgt
 
     setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
     # Suppress the warnings from this delete function when using sglang.bench_one_batch
-    from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+    from sglang.srt.distributed.device_communicators.custom_all_reduce import (
+        CustomAllreduce,
+    )
 
     setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
 
 
-vllm_all_gather_backup = None
-
-
-def monkey_patch_vllm_all_gather(reverse: bool = False):
-    """Monkey patch all-gather to remove in-place operations."""
-    from torch.distributed import _functional_collectives as funcol
-    from vllm.distributed.parallel_state import GroupCoordinator
-
-    global vllm_all_gather_backup
-    if vllm_all_gather_backup is None:
-        vllm_all_gather_backup = GroupCoordinator.all_gather
-
-    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
-        world_size = self.world_size
-        # Bypass the function if we are using only 1 GPU.
-        if world_size == 1:
-            return input_
-        assert (
-            -input_.dim() <= dim < input_.dim()
-        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
-        if dim < 0:
-            # Convert negative dim to positive.
-            dim += input_.dim()
-        input_size = input_.size()
-        # Allocate output tensor.
-        output_tensor = torch.empty(
-            (world_size,) + input_size, dtype=input_.dtype, device=input_.device
-        )
-
-        output_tensor = funcol.all_gather_tensor(
-            input_, gather_dim=0, group=self.device_group
-        ).view((world_size,) + input_size)
-
-        # Reshape
-        output_tensor = output_tensor.movedim(0, dim)
-        output_tensor = output_tensor.reshape(
-            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
-        )
-        return output_tensor
-
-    if reverse:
-        setattr(GroupCoordinator, "all_gather", vllm_all_gather_backup)
-    else:
-        setattr(GroupCoordinator, "all_gather", all_gather)
-
-
 def monkey_patch_vllm_gguf_config():
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.gguf import (
         GGUFConfig,
         GGUFEmbeddingMethod,
         GGUFLinearMethod,
     )
 
+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 
     def get_quant_method_with_embedding_replaced(
@@ -788,7 +743,9 @@ def first_rank_print(*args, **kwargs):
         pass
 
 
-def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint: str):
+def get_zmq_socket(
+    context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
+):
     mem = psutil.virtual_memory()
     total_mem = mem.total / 1024**3
     available_mem = mem.available / 1024**3
@@ -801,14 +758,17 @@ def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint:
     if socket_type == zmq.PUSH:
         socket.setsockopt(zmq.SNDHWM, 0)
         socket.setsockopt(zmq.SNDBUF, buf_size)
-        socket.connect(f"ipc://{endpoint}")
     elif socket_type == zmq.PULL:
         socket.setsockopt(zmq.RCVHWM, 0)
         socket.setsockopt(zmq.RCVBUF, buf_size)
-        socket.bind(f"ipc://{endpoint}")
     else:
         raise ValueError(f"Unsupported socket type: {socket_type}")
 
+    if bind:
+        socket.bind(endpoint)
+    else:
+        socket.connect(endpoint)
+
     return socket
 
 
@@ -1250,9 +1210,9 @@ def dataclass_to_string_truncated(data, max_length=2048):
     if isinstance(data, str):
         if len(data) > max_length:
             half_length = max_length // 2
-            return f'"{data[:half_length]} ... {data[-half_length:]}"'
+            return f"{repr(data[:half_length])} ... {repr(data[-half_length:])}"
         else:
-            return f'"{data}"'
+            return f"{repr(data)}"
     elif isinstance(data, (list, tuple)):
         if len(data) > max_length:
             half_length = max_length // 2
@@ -1263,7 +1223,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
         return (
             "{"
             + ", ".join(
-                f"{k}: {dataclass_to_string_truncated(v, max_length)}"
+                f"'{k}': {dataclass_to_string_truncated(v, max_length)}"
                 for k, v in data.items()
            )
            + "}"
@@ -1404,3 +1364,81 @@ def nullable_str(val: str):
     if not val or val == "None":
         return None
     return val
+
+
+def set_uvicorn_logging_configs():
+    LOGGING_CONFIG["formatters"]["default"][
+        "fmt"
+    ] = "[%(asctime)s] %(levelprefix)s %(message)s"
+    LOGGING_CONFIG["formatters"]["default"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
+    LOGGING_CONFIG["formatters"]["access"][
+        "fmt"
+    ] = '[%(asctime)s] %(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s'
+    LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
+
+
+def get_ip() -> str:
+    # SGLANG_HOST_IP env can be ignore
+    host_ip = os.getenv("SGLANG_HOST_IP", "") or os.getenv("HOST_IP", "")
+    if host_ip:
+        return host_ip
+
+    # IP is not set, try to get it from the network interface
+
+    # try ipv4
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    try:
+        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    # try ipv6
+    try:
+        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+        # Google's public DNS server, see
+        # https://developers.google.com/speed/public-dns/docs/using#addresses
+        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    warnings.warn(
+        "Failed to get the IP address, using 0.0.0.0 by default."
+        "The value can be set by the environment variable"
+        " SGLANG_HOST_IP or HOST_IP.",
+        stacklevel=2,
+    )
+    return "0.0.0.0"
+
+
+def get_open_port() -> int:
+
+    port = os.getenv("SGLANG_PORT")
+    if port is not None:
+        while True:
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.bind(("", port))
+                    return port
+            except OSError:
+                port += 1  # Increment port number if already in use
+                logger.info("Port %d is already in use, trying port %d", port - 1, port)
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def is_valid_ipv6_address(address: str) -> bool:
+    try:
+        ipaddress.IPv6Address(address)
+        return True
+    except ValueError:
+        return False
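
Note: with the `get_zmq_socket` change above, callers now pass a fully qualified endpoint (`ipc://...` or `tcp://...`) and choose bind vs. connect explicitly, instead of having the transport hardcoded and the direction tied to the socket type. A minimal usage sketch (the endpoint and payload are illustrative, not from the diff):

```python
import zmq

from sglang.srt.utils import get_zmq_socket

ctx = zmq.Context()
# Convention in the new code: the receiving side binds, the sending side connects.
receiver = get_zmq_socket(ctx, zmq.PULL, "tcp://127.0.0.1:5555", bind=True)
sender = get_zmq_socket(ctx, zmq.PUSH, "tcp://127.0.0.1:5555", bind=False)

sender.send_pyobj({"rid": "example-request"})
print(receiver.recv_pyobj())  # {'rid': 'example-request'}
```

Decoupling bind/connect from PUSH/PULL is what lets the same helper serve both the single-node IPC case and the multi-node TCP case introduced in `PortArgs.init_new`.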
sglang/test/runners.py CHANGED
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================
 
-import json
 import multiprocessing as mp
 import os
 from dataclasses import dataclass
@@ -22,8 +21,8 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoModelForCausalLM
 
+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server import Runtime
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
 
 DEFAULT_PROMPTS = [
@@ -278,7 +277,7 @@ class SRTRunner:
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
-        self.runtime = Runtime(
+        self.engine = Engine(
             model_path=model_path,
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
@@ -306,7 +305,7 @@ class SRTRunner:
             top_output_logprobs = []
             sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
             for i, prompt in enumerate(prompts):
-                response = self.runtime.generate(
+                response = self.engine.generate(
                     prompt,
                     lora_path=lora_paths[i] if lora_paths else None,
                     sampling_params=sampling_params,
@@ -314,7 +313,6 @@ class SRTRunner:
                     logprob_start_len=0,
                     top_logprobs_num=NUM_TOP_LOGPROBS,
                 )
-                response = json.loads(response)
                 output_strs.append(response["text"])
                 top_input_logprobs.append(
                     [
@@ -343,8 +341,7 @@ class SRTRunner:
                 top_output_logprobs=top_output_logprobs,
             )
         else:
-            response = self.runtime.encode(prompts)
-            response = json.loads(response)
+            response = self.engine.encode(prompts)
             if self.model_type == "embedding":
                 logits = [x["embedding"] for x in response]
                 return ModelOutput(embed_logits=logits)
@@ -366,20 +363,18 @@ class SRTRunner:
             # the return value contains logprobs from prefill
             output_strs = []
             sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
-            response = self.runtime.generate(
+            response = self.engine.generate(
                 prompts,
                 lora_path=lora_paths if lora_paths else None,
                 sampling_params=sampling_params,
             )
-            response = json.loads(response)
             output_strs = [r["text"] for r in response]
 
             return ModelOutput(
                 output_strs=output_strs,
             )
         else:
-            response = self.runtime.encode(prompts)
-            response = json.loads(response)
+            response = self.engine.encode(prompts)
             if self.model_type == "embedding":
                 logits = [x["embedding"] for x in response]
                 return ModelOutput(embed_logits=logits)
@@ -391,8 +386,8 @@ class SRTRunner:
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        self.runtime.shutdown()
-        del self.runtime
+        self.engine.shutdown()
+        del self.engine
 
 
 def monkey_patch_gemma2_sdpa():
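
Note: `SRTRunner` now drives the new in-process `Engine` (from `sglang/srt/entrypoints/engine.py`, added in this release) instead of the HTTP-backed `Runtime`. `Engine.generate` and `Engine.encode` return parsed Python objects, which is why the `json.loads` round-trips disappear above. A rough sketch of the new flow, with an illustrative model path and parameters:

```python
from sglang.srt.entrypoints.engine import Engine

# Sketch only: Engine is assumed to accept ServerArgs-style keyword arguments,
# mirroring the constructor call in SRTRunner above.
engine = Engine(model_path="meta-llama/Llama-3.1-8B-Instruct", tp_size=1)
response = engine.generate(
    "The capital of France is",
    sampling_params={"max_new_tokens": 8, "temperature": 0},
)
print(response["text"])  # already a dict; no json.loads needed
engine.shutdown()
```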
sglang/test/test_programs.py CHANGED
@@ -535,7 +535,7 @@ def test_hellaswag_select():
 
     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
-    assert np.abs(accuracy_gen - accuracy) < 0.01
+    assert np.abs(accuracy_gen - accuracy) < 0.05
     assert np.abs(latency_gen - latency) < 1
 
     return accuracy, latency
sglang/test/test_utils.py CHANGED
@@ -40,6 +40,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mis
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 
 
 def is_in_ci():
@@ -405,7 +406,7 @@ def popen_launch_server(
     base_url: str,
     timeout: float,
     api_key: Optional[str] = None,
-    other_args: tuple = (),
+    other_args: list[str] = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
 ):
@@ -560,6 +561,7 @@ def run_bench_serving(
         tokenizer=tokenizer,
         num_prompts=num_prompts,
         sharegpt_output_len=None,
+        sharegpt_context_len=None,
         random_input_len=random_input_len,
         random_output_len=random_output_len,
         random_range_ratio=0.0,
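
Note: a hedged sketch of how the updated test helpers might be combined (the first positional argument of `popen_launch_server` is assumed from context to be the model; the server flag comes from the `server_args.py` diff above):

```python
from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    popen_launch_server,
)

process = popen_launch_server(
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,  # assumed model argument
    base_url="http://127.0.0.1:30000",
    timeout=300,
    other_args=["--enable-custom-logit-processor"],  # now typed as a list of CLI strings
)
```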
sglang/utils.py CHANGED
@@ -1,7 +1,6 @@
 """Common utilities"""
 
 import base64
-import gc
 import importlib
 import json
 import logging
@@ -15,7 +14,7 @@ import urllib.request
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from json import dumps
-from typing import Optional, Union
+from typing import Any, Callable, List, Optional, Tuple, Type, Union
 
 import numpy as np
 import requests
@@ -363,3 +362,14 @@ def terminate_process(process):
 def print_highlight(html_content: str):
     html_content = str(html_content).replace("\n", "<br>")
     display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
+
+
+class TypeBasedDispatcher:
+    def __init__(self, mapping: List[Tuple[Type, Callable]]):
+        self._mapping = mapping
+
+    def __call__(self, obj: Any):
+        for ty, fn in self._mapping:
+            if isinstance(obj, ty):
+                return fn(obj)
+        raise ValueError(f"Invalid object: {obj}")
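
Note: the new `TypeBasedDispatcher` routes an object to the first handler whose registered type matches via `isinstance`. A small usage sketch:

```python
from sglang.utils import TypeBasedDispatcher

dispatcher = TypeBasedDispatcher(
    [
        (int, lambda x: f"int: {x}"),
        (str, lambda s: f"str: {s}"),
    ]
)

print(dispatcher(3))        # int: 3
print(dispatcher("hello"))  # str: hello
dispatcher(1.5)             # raises ValueError("Invalid object: 1.5")
```

Registration order matters: the first matching entry wins, so subclasses (e.g. `bool` vs. `int`) should be registered before their bases.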
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post6"
+__version__ = "0.4.1.post7"
{sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post6
+Version: 0.4.1.post7
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -236,13 +236,13 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
+Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.post12; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post14; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
+Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -252,6 +252,9 @@ Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
+Provides-Extra: srt-cpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: torch; extra == "srt-cpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -288,6 +291,11 @@ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
 Requires-Dist: sglang[openai]; extra == "all-hpu"
 Requires-Dist: sglang[anthropic]; extra == "all-hpu"
 Requires-Dist: sglang[litellm]; extra == "all-hpu"
+Provides-Extra: all-cpu
+Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
+Requires-Dist: sglang[openai]; extra == "all-cpu"
+Requires-Dist: sglang[anthropic]; extra == "all-cpu"
+Requires-Dist: sglang[litellm]; extra == "all-cpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -300,6 +308,9 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
 Provides-Extra: dev-hpu
 Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
 Requires-Dist: sglang[test]; extra == "dev-hpu"
+Provides-Extra: dev-cpu
+Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
+Requires-Dist: sglang[test]; extra == "dev-cpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -361,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.