sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. sglang/bench_serving.py +23 -3
  2. sglang/srt/configs/deepseekvl2.py +10 -1
  3. sglang/srt/configs/model_config.py +5 -16
  4. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  5. sglang/srt/distributed/parallel_state.py +32 -5
  6. sglang/srt/entrypoints/http_server.py +7 -1
  7. sglang/srt/entrypoints/verl_engine.py +2 -0
  8. sglang/srt/function_call_parser.py +0 -1
  9. sglang/srt/layers/attention/flashattention_backend.py +218 -79
  10. sglang/srt/layers/dp_attention.py +12 -1
  11. sglang/srt/layers/moe/topk.py +30 -3
  12. sglang/srt/layers/quantization/__init__.py +134 -165
  13. sglang/srt/layers/quantization/awq.py +200 -0
  14. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  15. sglang/srt/layers/quantization/gptq.py +30 -40
  16. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  17. sglang/srt/layers/rotary_embedding.py +12 -0
  18. sglang/srt/lora/backend/base_backend.py +4 -4
  19. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  20. sglang/srt/lora/backend/triton_backend.py +5 -8
  21. sglang/srt/lora/layers.py +19 -33
  22. sglang/srt/lora/lora_manager.py +20 -7
  23. sglang/srt/lora/mem_pool.py +12 -6
  24. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  25. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  26. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  27. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  28. sglang/srt/lora/utils.py +6 -0
  29. sglang/srt/managers/io_struct.py +4 -2
  30. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  31. sglang/srt/managers/schedule_batch.py +1 -0
  32. sglang/srt/managers/scheduler.py +25 -19
  33. sglang/srt/managers/tokenizer_manager.py +0 -1
  34. sglang/srt/managers/tp_worker.py +3 -0
  35. sglang/srt/model_executor/cuda_graph_runner.py +9 -8
  36. sglang/srt/model_executor/model_runner.py +9 -6
  37. sglang/srt/model_loader/loader.py +11 -1
  38. sglang/srt/model_loader/weight_utils.py +6 -3
  39. sglang/srt/models/clip.py +563 -0
  40. sglang/srt/models/deepseek_janus_pro.py +2 -2
  41. sglang/srt/models/deepseek_v2.py +151 -26
  42. sglang/srt/models/gemma3_causal.py +12 -2
  43. sglang/srt/models/gemma3_mm.py +6 -0
  44. sglang/srt/openai_api/adapter.py +88 -87
  45. sglang/srt/openai_api/protocol.py +10 -5
  46. sglang/srt/patch_torch.py +71 -0
  47. sglang/srt/server_args.py +21 -11
  48. sglang/srt/speculative/eagle_worker.py +1 -1
  49. sglang/srt/utils.py +33 -0
  50. sglang/test/runners.py +27 -2
  51. sglang/test/test_utils.py +1 -1
  52. sglang/version.py +1 -1
  53. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +8 -4
  54. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +57 -53
  55. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +0 -0
  56. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/licenses/LICENSE +0 -0
  57. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py CHANGED
```diff
@@ -28,6 +28,7 @@ class ModelCard(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "sglang"
     root: Optional[str] = None
+    max_model_len: Optional[int] = None


 class ModelList(BaseModel):
@@ -187,7 +188,7 @@ class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Literal["stop", "length", "content_filter"]
     matched_stop: Union[None, int, str] = None


@@ -204,7 +205,7 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
     matched_stop: Union[None, int, str] = None


@@ -322,7 +323,7 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = None
     n: int = 1
     presence_penalty: float = 0.0
-    response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
@@ -387,7 +388,9 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: str
+    finish_reason: Literal[
+        "stop", "length", "tool_calls", "content_filter", "function_call"
+    ]
    matched_stop: Union[None, int, str] = None


@@ -411,7 +414,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[
+        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
+    ] = None
     matched_stop: Union[None, int, str] = None
```
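Note on the finish_reason changes above: moving from plain `str` to `Literal` means Pydantic now validates the value against the OpenAI-compatible set at construction time instead of accepting any string. A minimal sketch of the new behavior (assumes the models are importable from `sglang.srt.openai_api.protocol`):

```python
from pydantic import ValidationError

from sglang.srt.openai_api.protocol import CompletionResponseChoice

# A declared literal passes validation; finish_reason is now required here.
choice = CompletionResponseChoice(index=0, text="hello", finish_reason="length")

# An arbitrary string is rejected at construction time.
try:
    CompletionResponseChoice(index=0, text="hello", finish_reason="timeout")
except ValidationError as err:
    print(err.errors()[0]["loc"])  # ('finish_reason',)
```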
sglang/srt/patch_torch.py ADDED
```diff
@@ -0,0 +1,71 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from typing import Callable, Union
+
+import torch
+from torch.multiprocessing import reductions
+
+
+def monkey_patch_torch_reductions():
+    """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
+
+    if hasattr(reductions, "_reduce_tensor_original"):
+        return
+
+    reductions._reduce_tensor_original = reductions.reduce_tensor
+    reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor
+
+    reductions.reduce_tensor = _reduce_tensor_modified
+    reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified
+
+    reductions.init_reductions()
+
+
+# The signature has not been changed for years, and we will not need this when the next version is released,
+# so it looks safe to use a constant.
+_REDUCE_TENSOR_ARG_DEVICE_INDEX = 6
+
+
+def _reduce_tensor_modified(*args, **kwargs):
+    output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs)
+    output_args = _modify_tuple(
+        output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid
+    )
+    return output_fn, output_args
+
+
+def _rebuild_cuda_tensor_modified(*args):
+    args = _modify_tuple(args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_from_maybe_uuid)
+    return reductions._rebuild_cuda_tensor_original(*args)
+
+
+def _device_to_uuid(device: int) -> str:
+    return str(torch.cuda.get_device_properties(device).uuid)
+
+
+def _device_from_maybe_uuid(device_maybe_uuid: Union[int, str]) -> int:
+    if isinstance(device_maybe_uuid, int):
+        return device_maybe_uuid
+
+    if isinstance(device_maybe_uuid, str):
+        for device in range(torch.cuda.device_count()):
+            if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid:
+                return device
+        raise Exception("Invalid device_uuid=" + device_maybe_uuid)
+
+    raise Exception(f"Unknown type: {device_maybe_uuid=}")
+
+
+def _modify_tuple(t, index: int, modifier: Callable):
+    return *t[:index], modifier(t[index]), *t[index + 1 :]
```
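The new module works around pytorch/pytorch#149248: when a CUDA tensor is shared across processes, the serialized metadata carries a raw device index, which breaks if the receiving process enumerates GPUs differently (e.g. under a different `CUDA_VISIBLE_DEVICES`). The patch swaps the index for the device UUID on reduce and maps it back on rebuild. A minimal usage sketch (the queue and tensor are illustrative, not part of the diff); both sides patch before any tensor crosses the process boundary:

```python
import torch
import torch.multiprocessing as mp

from sglang.srt.patch_torch import monkey_patch_torch_reductions


def consumer(queue):
    # Patch here too: the rebuild function is resolved by name at unpickle time.
    monkey_patch_torch_reductions()
    tensor = queue.get()
    print(tensor.device, tensor.sum().item())


if __name__ == "__main__":
    monkey_patch_torch_reductions()
    ctx = mp.get_context("spawn")
    queue = ctx.Queue()
    queue.put(torch.ones(4, device="cuda:0"))
    proc = ctx.Process(target=consumer, args=(queue,))
    proc.start()
    proc.join()
```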
sglang/srt/server_args.py CHANGED
```diff
@@ -24,6 +24,7 @@ from typing import List, Optional
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
+    configure_ipv6,
     get_amdgpu_memory_capacity,
     get_device,
     get_hpu_memory_capacity,
```
```diff
@@ -52,7 +53,7 @@ class ServerArgs:
     dtype: str = "auto"
     kv_cache_dtype: str = "auto"
     quantization: Optional[str] = None
-    quantization_param_path: nullable_str = None
+    quantization_param_path: Optional[str] = None
     context_length: Optional[int] = None
     device: Optional[str] = None
     served_model_name: Optional[str] = None
```
```diff
@@ -140,7 +141,7 @@ class ServerArgs:

     # Double Sparsity
     enable_double_sparsity: bool = False
-    ds_channel_config_path: str = None
+    ds_channel_config_path: Optional[str] = None
     ds_heavy_channel_num: int = 32
     ds_heavy_token_num: int = 256
     ds_heavy_channel_type: str = "qk"
```
```diff
@@ -173,7 +174,7 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    tool_call_parser: str = None
+    tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     enable_flashinfer_mla: bool = False
```
```diff
@@ -290,12 +291,17 @@ class ServerArgs:
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
             )
-        # DeepEP MoE
-        if self.enable_deepep_moe:
-            self.ep_size = self.dp_size
-            logger.info(
-                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the data parallel size[{self.dp_size}]."
-            )
+
+        self.enable_sp_layernorm = False
+        # DeepEP MoE
+        if self.enable_deepep_moe:
+            self.ep_size = self.tp_size
+            self.enable_sp_layernorm = (
+                self.dp_size < self.tp_size if self.enable_dp_attention else True
+            )
+            logger.info(
+                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )

         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
```
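For readability, the new `enable_sp_layernorm` condition reduces to: always on when DeepEP MoE runs without DP attention, and on only when `dp_size < tp_size` when DP attention is enabled. A small self-check of that expression (the sizes are illustrative):

```python
def enable_sp_layernorm(enable_dp_attention: bool, dp_size: int, tp_size: int) -> bool:
    # Mirrors the expression added to ServerArgs above.
    return dp_size < tp_size if enable_dp_attention else True


assert enable_sp_layernorm(False, dp_size=1, tp_size=8)      # DeepEP without DP attention
assert not enable_sp_layernorm(True, dp_size=8, tp_size=8)   # DP covers all TP ranks
assert enable_sp_layernorm(True, dp_size=2, tp_size=8)       # DP smaller than TP
```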
```diff
@@ -1200,8 +1206,12 @@ class PortArgs:
         # DP attention. Use TCP + port to handle both single-node and multi-node.
         if server_args.nnodes == 1 and server_args.dist_init_addr is None:
             dist_init_addr = ("127.0.0.1", server_args.port + ZMQ_TCP_PORT_DELTA)
+        elif server_args.dist_init_addr.startswith("["):  # ipv6 address
+            port_num, host = configure_ipv6(server_args.dist_init_addr)
+            dist_init_addr = (host, str(port_num))
         else:
             dist_init_addr = server_args.dist_init_addr.split(":")
+
         assert (
             len(dist_init_addr) == 2
         ), "please provide --dist-init-addr as host:port of head node"
```
```diff
@@ -1210,10 +1220,10 @@ class PortArgs:
         port_base = int(dist_init_port) + 1
         if dp_rank is None:
             scheduler_input_port = (
-                port_base + 2
+                port_base + 3
             )  # TokenizerManager to DataParallelController
         else:
-            scheduler_input_port = port_base + 2 + 1 + dp_rank
+            scheduler_input_port = port_base + 3 + 1 + dp_rank

         return PortArgs(
             tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
```
sglang/srt/speculative/eagle_worker.py CHANGED
```diff
@@ -586,5 +586,5 @@ def load_token_map(token_map_path: str) -> List[int]:
         ignore_patterns=["*.bin", "*.safetensors"],
     )
     token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
-    hot_token_id = torch.load(token_map_path)
+    hot_token_id = torch.load(token_map_path, weights_only=True)
     return torch.tensor(hot_token_id, dtype=torch.int32)
```
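Passing `weights_only=True` makes `torch.load` use its restricted unpickler, which reconstructs only tensors and plain containers and refuses arbitrary objects, so loading an untrusted token-map file can no longer execute code. A tiny illustration (the file path is hypothetical):

```python
import torch

torch.save([3, 14, 159], "/tmp/hot_token_map.pt")  # hypothetical token map

# Restricted unpickler: fine for plain data, rejects pickled executable objects.
hot_token_id = torch.load("/tmp/hot_token_map.pt", weights_only=True)
print(torch.tensor(hot_token_id, dtype=torch.int32))
```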
sglang/srt/utils.py CHANGED
```diff
@@ -1602,6 +1602,7 @@ def get_ip() -> str:
 def get_open_port() -> int:
     port = os.getenv("SGLANG_PORT")
     if port is not None:
+        port = int(port)
         while True:
             try:
                 with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
```
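The added `int(port)` matters because `os.getenv` always returns a string, while `socket.bind` requires an integer port, so a configured `SGLANG_PORT` previously failed with a `TypeError` on the bind call. A sketch of the corrected path:

```python
import os
import socket

os.environ["SGLANG_PORT"] = "30000"
port = os.getenv("SGLANG_PORT")  # "30000" (a str)

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind(("", int(port)))  # without int(), bind raises TypeError
    print(s.getsockname()[1])  # 30000
```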
```diff
@@ -1630,6 +1631,38 @@ def is_valid_ipv6_address(address: str) -> bool:
     return False


+def configure_ipv6(dist_init_addr):
+    addr = dist_init_addr
+    end = addr.find("]")
+    if end == -1:
+        raise ValueError("invalid IPv6 address format: missing ']'")
+
+    host = addr[: end + 1]
+
+    # this only validates the address without brackets: we still need the below checks.
+    # if it's invalid, immediately raise an error so we know it's not formatting issues.
+    if not is_valid_ipv6_address(host[1:end]):
+        raise ValueError(f"invalid IPv6 address: {host}")
+
+    port_str = None
+    if len(addr) > end + 1:
+        if addr[end + 1] == ":":
+            port_str = addr[end + 2 :]
+        else:
+            raise ValueError("received IPv6 address format: expected ':' after ']'")
+
+    if not port_str:
+        raise ValueError(
+            "a port must be specified in IPv6 address (format: [ipv6]:port)"
+        )
+
+    try:
+        port = int(port_str)
+    except ValueError:
+        raise ValueError(f"invalid port in IPv6 address: '{port_str}'")
+    return port, host
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank

```
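Given the parsing rules above, `configure_ipv6` returns a `(port, bracketed_host)` pair and raises `ValueError` for every malformed input. A quick check (the address is illustrative):

```python
from sglang.srt.utils import configure_ipv6

port, host = configure_ipv6("[2001:db8::1]:5000")
assert (port, host) == (5000, "[2001:db8::1]")

# ValueError cases, matching the messages above:
#   configure_ipv6("2001:db8::1:5000")  -> missing ']'
#   configure_ipv6("[2001:db8::1]")     -> a port must be specified
#   configure_ipv6("[2001:db8::1]:abc") -> invalid port
```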
sglang/test/runners.py CHANGED
```diff
@@ -19,10 +19,16 @@ from typing import List, Optional, Tuple, Union

 import torch
 import torch.nn.functional as F
-from transformers import AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+)

 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Engine
+from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l

 DEFAULT_PROMPTS = [
```
```diff
@@ -140,7 +146,6 @@ class HFRunner:
     def _get_gme_qwen2_vl_embeddings(
         self, prompts, image_data: Optional[List[str]] = None
    ):
-        from sglang.srt.utils import load_image

        images = None
        if image_data is not None:
```
```diff
@@ -226,6 +231,9 @@ class HFRunner:
                     low_cpu_mem_usage=True,
                 ).cuda()
                 self.processor = AutoProcessor.from_pretrained(model_path)
+            elif "clip" in model_path.lower():
+                self.model = AutoModel.from_pretrained(model_path).cuda()
+                self.processor = AutoProcessor.from_pretrained(model_path)
             else:
                 self.model = _get_sentence_transformer_embedding_model(
                     model_path, torch_dtype
```
```diff
@@ -272,6 +280,23 @@ class HFRunner:
                 assert not self.output_str_only
                 if "gme-qwen2-vl" in model_path.lower():
                     logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
+                elif "clip" in model_path.lower():
+                    if image_data is not None:
+                        image = load_image(image_data)
+                        inputs = self.processor(
+                            images=image[0], return_tensors="pt"
+                        )
+                        logits = self.model.get_image_features(
+                            pixel_values=inputs.data["pixel_values"].cuda(),
+                        ).tolist()
+                    else:
+                        inputs = self.tokenizer(
+                            prompts, padding=True, return_tensors="pt"
+                        )
+                        logits = self.model.get_text_features(
+                            input_ids=inputs.data["input_ids"].cuda(),
+                            attention_mask=inputs.data["attention_mask"].cuda(),
+                        ).tolist()
                 else:
                     logits = self.model.encode(prompts).tolist()
                 out_queue.put(ModelOutput(embed_logits=logits))
```
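The new branch relies on the stock transformers CLIP API: `get_text_features` and `get_image_features` each project into the shared CLIP embedding space, which is what the runner compares against SGLang's output. A standalone sketch of the text path (the checkpoint name is illustrative):

```python
import torch
from transformers import AutoModel, AutoTokenizer

name = "openai/clip-vit-base-patch32"
model = AutoModel.from_pretrained(name)  # resolves to CLIPModel
tokenizer = AutoTokenizer.from_pretrained(name)

inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
with torch.no_grad():
    text_embeds = model.get_text_features(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
print(text_embeds.shape)  # torch.Size([1, 512]) for this checkpoint
```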
sglang/test/test_utils.py CHANGED
```diff
@@ -29,7 +29,7 @@ from sglang.srt.utils import get_bool_env_var, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

-DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
+DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
```
sglang/version.py CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.4.4.post2"
+__version__ = "0.4.4.post3"
```
{sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4.post2
+Version: 0.4.4.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
```
```diff
@@ -218,6 +218,7 @@ Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
+Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
```
```diff
@@ -240,14 +241,17 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: transformers==4.50.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.16; extra == "runtime-common"
+Requires-Dist: compressed-tensors; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.5.post3; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.5.post4; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
+Requires-Dist: partial_json_parser; extra == "srt"
+Requires-Dist: einops; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
```
```diff
@@ -271,7 +275,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver>=0.0.3; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
```