sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff shows the content changes between publicly available package versions released to one of the supported registries, and is provided for informational purposes only.
Files changed (141)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +41 -27
  4. sglang/bench_one_batch.py +60 -4
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +83 -71
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +46 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/load_config.py +1 -0
  13. sglang/srt/configs/model_config.py +1 -0
  14. sglang/srt/constrained/base_grammar_backend.py +21 -0
  15. sglang/srt/constrained/xgrammar_backend.py +8 -4
  16. sglang/srt/conversation.py +14 -1
  17. sglang/srt/distributed/__init__.py +3 -3
  18. sglang/srt/distributed/communication_op.py +2 -1
  19. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
  21. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  22. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  23. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  24. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  25. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  26. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  27. sglang/srt/distributed/parallel_state.py +1 -1
  28. sglang/srt/distributed/utils.py +2 -1
  29. sglang/srt/entrypoints/engine.py +452 -0
  30. sglang/srt/entrypoints/http_server.py +603 -0
  31. sglang/srt/function_call_parser.py +494 -0
  32. sglang/srt/layers/activation.py +8 -8
  33. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  34. sglang/srt/layers/attention/triton_backend.py +4 -6
  35. sglang/srt/layers/attention/vision.py +204 -0
  36. sglang/srt/layers/dp_attention.py +71 -0
  37. sglang/srt/layers/layernorm.py +5 -5
  38. sglang/srt/layers/linear.py +65 -14
  39. sglang/srt/layers/logits_processor.py +49 -64
  40. sglang/srt/layers/moe/ep_moe/layer.py +24 -16
  41. sglang/srt/layers/moe/fused_moe_native.py +84 -1
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
  45. sglang/srt/layers/parameter.py +18 -8
  46. sglang/srt/layers/quantization/__init__.py +20 -23
  47. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  49. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  50. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  51. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  52. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  56. sglang/srt/layers/quantization/fp8.py +10 -4
  57. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  58. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  59. sglang/srt/layers/radix_attention.py +2 -2
  60. sglang/srt/layers/rotary_embedding.py +1184 -31
  61. sglang/srt/layers/sampler.py +64 -6
  62. sglang/srt/layers/torchao_utils.py +12 -6
  63. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  64. sglang/srt/lora/lora.py +1 -9
  65. sglang/srt/managers/configure_logging.py +3 -0
  66. sglang/srt/managers/data_parallel_controller.py +79 -72
  67. sglang/srt/managers/detokenizer_manager.py +24 -6
  68. sglang/srt/managers/image_processor.py +158 -2
  69. sglang/srt/managers/io_struct.py +57 -3
  70. sglang/srt/managers/schedule_batch.py +78 -45
  71. sglang/srt/managers/schedule_policy.py +26 -12
  72. sglang/srt/managers/scheduler.py +326 -201
  73. sglang/srt/managers/session_controller.py +1 -0
  74. sglang/srt/managers/tokenizer_manager.py +210 -121
  75. sglang/srt/managers/tp_worker.py +6 -4
  76. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  77. sglang/srt/managers/utils.py +44 -0
  78. sglang/srt/mem_cache/memory_pool.py +10 -32
  79. sglang/srt/metrics/collector.py +15 -6
  80. sglang/srt/model_executor/cuda_graph_runner.py +26 -30
  81. sglang/srt/model_executor/forward_batch_info.py +5 -7
  82. sglang/srt/model_executor/model_runner.py +44 -19
  83. sglang/srt/model_loader/loader.py +83 -6
  84. sglang/srt/model_loader/weight_utils.py +145 -6
  85. sglang/srt/models/baichuan.py +6 -6
  86. sglang/srt/models/chatglm.py +2 -2
  87. sglang/srt/models/commandr.py +17 -5
  88. sglang/srt/models/dbrx.py +13 -5
  89. sglang/srt/models/deepseek.py +3 -3
  90. sglang/srt/models/deepseek_v2.py +11 -11
  91. sglang/srt/models/exaone.py +2 -2
  92. sglang/srt/models/gemma.py +2 -2
  93. sglang/srt/models/gemma2.py +15 -25
  94. sglang/srt/models/gpt2.py +3 -5
  95. sglang/srt/models/gpt_bigcode.py +1 -1
  96. sglang/srt/models/granite.py +2 -2
  97. sglang/srt/models/grok.py +4 -3
  98. sglang/srt/models/internlm2.py +2 -2
  99. sglang/srt/models/llama.py +7 -5
  100. sglang/srt/models/minicpm.py +2 -2
  101. sglang/srt/models/minicpm3.py +9 -9
  102. sglang/srt/models/minicpmv.py +1238 -0
  103. sglang/srt/models/mixtral.py +3 -3
  104. sglang/srt/models/mixtral_quant.py +3 -3
  105. sglang/srt/models/mllama.py +2 -2
  106. sglang/srt/models/olmo.py +3 -3
  107. sglang/srt/models/olmo2.py +4 -4
  108. sglang/srt/models/olmoe.py +7 -13
  109. sglang/srt/models/phi3_small.py +2 -2
  110. sglang/srt/models/qwen.py +2 -2
  111. sglang/srt/models/qwen2.py +41 -4
  112. sglang/srt/models/qwen2_moe.py +3 -3
  113. sglang/srt/models/qwen2_vl.py +22 -122
  114. sglang/srt/models/stablelm.py +2 -2
  115. sglang/srt/models/torch_native_llama.py +20 -7
  116. sglang/srt/models/xverse.py +6 -6
  117. sglang/srt/models/xverse_moe.py +6 -6
  118. sglang/srt/openai_api/adapter.py +139 -37
  119. sglang/srt/openai_api/protocol.py +7 -4
  120. sglang/srt/sampling/custom_logit_processor.py +38 -0
  121. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  122. sglang/srt/sampling/sampling_batch_info.py +143 -18
  123. sglang/srt/sampling/sampling_params.py +3 -1
  124. sglang/srt/server.py +4 -1090
  125. sglang/srt/server_args.py +77 -15
  126. sglang/srt/speculative/eagle_utils.py +37 -15
  127. sglang/srt/speculative/eagle_worker.py +11 -13
  128. sglang/srt/utils.py +164 -129
  129. sglang/test/runners.py +8 -13
  130. sglang/test/test_programs.py +2 -1
  131. sglang/test/test_utils.py +83 -22
  132. sglang/utils.py +12 -2
  133. sglang/version.py +1 -1
  134. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
  135. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
  136. sglang/launch_server_llavavid.py +0 -25
  137. sglang/srt/constrained/__init__.py +0 -16
  138. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  139. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  140. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  141. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/pynccl.py
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/pynccl.py
 
 # This file is a pure Python wrapper for the NCCL library.
 # The main purpose is to use NCCL combined with CUDA graph.
@@ -57,7 +57,7 @@ def find_nccl_library() -> str:
         so_file = "librccl.so.1"
     else:
         raise ValueError("NCCL only supports CUDA and ROCm backends.")
-    logger.info("Found nccl from library %s", so_file)
+    logger.debug("Found nccl from library %s", so_file)
     return so_file
 
 
@@ -187,6 +187,43 @@ class NCCLLibrary:
                 cudaStream_t,
             ],
         ),
+        # ncclResult_t ncclAllGather(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllGather",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t ncclReduceScatter(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduceScatter",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t ncclSend(
         #   const void* sendbuff, size_t count, ncclDataType_t datatype,
         #   int dest, ncclComm_t comm, cudaStream_t stream);
@@ -217,6 +254,23 @@ class NCCLLibrary:
                 cudaStream_t,
             ],
         ),
+        # ncclResult_t ncclBroadcast(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, int root, ncclComm_t comm,
+        #   cudaStream_t stream);
+        Function(
+            "ncclBroadcast",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # be cautious! this is a collective call, it will block until all
         # processes in the communicator have called this function.
         # because Python object destruction can happen in random order,
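
The Function entries above are declarative descriptors that NCCLLibrary later resolves against the loaded shared library via ctypes. As a rough, illustrative sketch of that pattern (this is not code from the wheel; the real lookup lives inside NCCLLibrary), binding one of the newly declared symbols by hand would look something like:

```python
# Illustrative only -- not code from the wheel. Shows how one Function
# descriptor maps onto a plain ctypes binding.
import ctypes

lib = ctypes.CDLL("libnccl.so.2")      # or "librccl.so.1" on ROCm
bcast = lib.ncclBroadcast
bcast.restype = ctypes.c_int           # ncclResult_t is an int-sized status code
bcast.argtypes = [
    ctypes.c_void_p,   # const void* sendbuff
    ctypes.c_void_p,   # void*       recvbuff
    ctypes.c_size_t,   # size_t      count
    ctypes.c_int,      # ncclDataType_t datatype
    ctypes.c_int,      # int         root
    ctypes.c_void_p,   # ncclComm_t  comm
    ctypes.c_void_p,   # cudaStream_t stream
]
```

Declaring the C argument layout in the descriptor table keeps it next to the Python wrapper methods that eventually call these symbols.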
@@ -321,6 +375,46 @@ class NCCLLibrary:
             )
         )
 
+    def ncclReduceScatter(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduceScatter"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclAllGather(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # which is an aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllGather"](
+                sendbuff, recvbuff, count, datatype, comm, stream
+            )
+        )
+
     def ncclSend(
         self,
         sendbuff: buffer_type,
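
The new wrapper methods above operate on raw device pointers plus an existing communicator and CUDA stream. A hypothetical call site, assuming a ready NCCLLibrary instance `lib`, an already-initialized `comm` handle, and a torch-to-NCCL dtype mapping (none of which are shown in this diff), might look like:

```python
# Hypothetical call site -- not code from the wheel. Assumes `lib` is an
# initialized NCCLLibrary and `comm` an already-created ncclComm_t handle.
import ctypes
import torch

# Subset of NCCL's ncclDataType_t enum values (assumption for illustration).
_NCCL_DTYPE = {torch.float16: 6, torch.float32: 7, torch.bfloat16: 9}

def all_gather_into(lib, comm, world_size: int, local: torch.Tensor) -> torch.Tensor:
    # Receive buffer must hold one shard per rank.
    out = torch.empty(world_size * local.numel(), dtype=local.dtype, device=local.device)
    stream = torch.cuda.current_stream()
    lib.ncclAllGather(
        ctypes.c_void_p(local.data_ptr()),    # sendbuff: this rank's shard
        ctypes.c_void_p(out.data_ptr()),      # recvbuff: world_size * count elements
        local.numel(),                        # count, in elements per rank
        _NCCL_DTYPE[local.dtype],             # ncclDataType_t
        comm,                                 # communicator created elsewhere
        ctypes.c_void_p(stream.cuda_stream),  # cudaStream_t
    )
    return out.view(world_size, *local.shape)
```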
@@ -347,6 +441,22 @@ class NCCLLibrary:
             self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
         )
 
+    def ncclBroadcast(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclBroadcast"](
+                sendbuff, recvbuff, count, datatype, root, comm, stream
+            )
+        )
+
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
@@ -1,11 +1,9 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/shm_broadcast.py
-import ipaddress
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/shm_broadcast.py
+
 import logging
 import os
 import pickle
-import socket
 import time
-import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from multiprocessing import shared_memory
@@ -18,6 +16,8 @@ from torch.distributed import ProcessGroup
 from zmq import IPV6  # type: ignore
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
 
+from sglang.srt.utils import get_ip, get_open_port, is_valid_ipv6_address
+
 # SGLANG_RINGBUFFER_WARNING_INTERVAL can be set to 60
 SGLANG_RINGBUFFER_WARNING_INTERVAL = int(
     os.environ.get("SGLANG_RINGBUFFER_WARNING_INTERVAL", "60")
@@ -26,73 +26,6 @@ SGLANG_RINGBUFFER_WARNING_INTERVAL = int(
 logger = logging.getLogger(__name__)
 
 
-def get_ip() -> str:
-    # SGLANG_HOST_IP env can be ignore
-    host_ip = os.getenv("SGLANG_HOST_IP", "") or os.getenv("HOST_IP", "")
-    if host_ip:
-        return host_ip
-
-    # IP is not set, try to get it from the network interface
-
-    # try ipv4
-    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-    try:
-        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
-        return s.getsockname()[0]
-    except Exception:
-        pass
-
-    # try ipv6
-    try:
-        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
-        # Google's public DNS server, see
-        # https://developers.google.com/speed/public-dns/docs/using#addresses
-        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
-        return s.getsockname()[0]
-    except Exception:
-        pass
-
-    warnings.warn(
-        "Failed to get the IP address, using 0.0.0.0 by default."
-        "The value can be set by the environment variable"
-        " SGLANG_HOST_IP or HOST_IP.",
-        stacklevel=2,
-    )
-    return "0.0.0.0"
-
-
-def get_open_port() -> int:
-
-    port = os.getenv("SGLANG_PORT")
-    if port is not None:
-        while True:
-            try:
-                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                    s.bind(("", port))
-                    return port
-            except OSError:
-                port += 1  # Increment port number if already in use
-                logger.info("Port %d is already in use, trying port %d", port - 1, port)
-    # try ipv4
-    try:
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-            s.bind(("", 0))
-            return s.getsockname()[1]
-    except OSError:
-        # try ipv6
-        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
-            s.bind(("", 0))
-            return s.getsockname()[1]
-
-
-def is_valid_ipv6_address(address: str) -> bool:
-    try:
-        ipaddress.IPv6Address(address)
-        return True
-    except ValueError:
-        return False
-
-
 class ShmRingBuffer:
 
     def __init__(
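
The three helpers removed here are relocated rather than deleted: the import added near the top of this file now pulls get_ip, get_open_port, and is_valid_ipv6_address from sglang.srt.utils. A minimal usage sketch, with behavior as described by the deleted implementations above:

```python
# Sketch of the relocated helpers, now imported from sglang.srt.utils.
from sglang.srt.utils import get_ip, get_open_port, is_valid_ipv6_address

host = get_ip()         # SGLANG_HOST_IP / HOST_IP override, else interface probe, else "0.0.0.0"
port = get_open_port()  # honors SGLANG_PORT when set, otherwise binds port 0 to find a free one
addr = f"tcp://[{host}]:{port}" if is_valid_ipv6_address(host) else f"tcp://{host}:{port}"
```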
@@ -313,7 +246,7 @@ class MessageQueue:
             remote_subscribe_port=remote_subscribe_port,
         )
 
-        logger.info("vLLM message queue communication handle: %s", self.handle)
+        logger.debug("Message queue communication handle: %s", self.handle)
 
     def export_handle(self) -> Handle:
         return self.handle
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/xpu_communicator.py
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/xpu_communicator.py
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
@@ -1,4 +1,4 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/parallel_state.py
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/parallel_state.py
 
 # Copyright 2023 The vLLM team.
 # Adapted from
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/utils.py
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/utils.py
+
 # Copyright 2023 The vLLM team.
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py