sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/conversation.py CHANGED
@@ -44,6 +44,7 @@ class SeparatorStyle(IntEnum):
     CHATGLM3 = auto()
     DEEPSEEK_CHAT = auto()
     METAMATH = auto()
+    QWEN2_VL_EMBED = auto()


 @dataclasses.dataclass
@@ -110,6 +111,15 @@ class Conversation:
             else:
                 ret += role + "\n"
             return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_VL_EMBED:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            ret += self.stop_str
+            return ret
         elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
             ret = system_prompt
             for role, message in self.messages:
@@ -181,7 +191,7 @@ class Conversation:

         for i, (role, message) in enumerate(self.messages):
             if i % 2 == 0:
-                ret += f"[Round {i//2 + round_add_n}]{self.sep}"
+                ret += f"[Round {i // 2 + round_add_n}]{self.sep}"

             if message:
                 ret += f"{role}:{message}{self.sep}"
@@ -366,6 +376,46 @@ def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates


+def generate_embedding_convs(
+    texts: List[str], images: List[str], template_name: str
+) -> List[Conversation]:
+    conv_template = chat_templates[template_name].copy()
+    convs = []
+    for text, image in zip(texts, images):
+        conv = Conversation(
+            name=conv_template.name,
+            system_template=conv_template.system_template,
+            system_message=conv_template.system_message,
+            roles=conv_template.roles,
+            messages=list(conv_template.messages),  # prevent in-place modification
+            offset=conv_template.offset,
+            sep_style=SeparatorStyle(conv_template.sep_style),
+            sep=conv_template.sep,
+            sep2=conv_template.sep2,
+            stop_str=conv_template.stop_str,
+            image_data=[],
+            modalities=[],
+            image_token=conv_template.image_token,
+        )
+        real_content = ""
+
+        if image is not None:
+            image_token = (
+                conv.image_token + "\n"
+                if conv.name != "gme-qwen2-vl"
+                else conv.image_token
+            )
+            real_content += image_token
+        if text is not None:
+            real_content += text
+        conv.append_message(conv.roles[0], real_content)
+        # Add a blank message for the assistant.
+        conv.append_message(conv.roles[1], None)
+        convs.append(conv)
+
+    return convs
+
+
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -403,7 +453,6 @@ def generate_chat_conv(
                 conv.system_message = getattr(message.content[0], "text", "")
         elif msg_role == "user":
             # Handle the various types of Chat Request content types here.
-            role = conv.roles[0]
             if isinstance(message.content, str):
                 conv.append_message(conv.roles[0], message.content)
             else:
@@ -555,6 +604,20 @@ register_conv_template(
     )
 )

+# Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
+register_conv_template(
+    Conversation(
+        name="gme-qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_VL_EMBED,
+        stop_str="<|endoftext|>",
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
 # Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
 register_conv_template(
     Conversation(
@@ -568,3 +631,18 @@ register_conv_template(
         image_token="(<image>./</image>)",
     )
 )
+
+# Reference: https://github.com/deepseek-ai/Janus?tab=readme-ov-file#janus-pro
+register_conv_template(
+    Conversation(
+        name="janus-pro",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language",
+        system_template="{system_message}.",
+        roles=("User", "Assistant"),
+        sep="\n\n",
+        sep2="<|end▁of▁sentence|>",
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        stop_str=["<|User|>", "<|end▁of▁sentence|>"],
+        image_token="<image_placeholder>",
+    )
+)
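As a quick orientation to the new embedding-template code above, here is a minimal sketch of how `generate_embedding_convs` and the `gme-qwen2-vl` template might be driven; the example texts and image URL are illustrative and not taken from the diff.

from sglang.srt.conversation import generate_embedding_convs

# Illustrative inputs: one text-only item and one text+image item.
texts = ["a picture of a cat", "find winter boots similar to this"]
images = [None, "https://example.com/boots.jpg"]

convs = generate_embedding_convs(texts, images, template_name="gme-qwen2-vl")
for conv in convs:
    # With the new QWEN2_VL_EMBED separator style, each prompt ends with the
    # template's stop_str ("<|endoftext|>") instead of an open assistant turn.
    print(conv.get_prompt())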
sglang/srt/custom_op.py CHANGED
@@ -1,8 +1,12 @@
+from typing import Optional
+
 import torch
 from torch import nn

-_is_cuda = torch.cuda.is_available() and torch.version.cuda
-_is_rocm = torch.cuda.is_available() and torch.version.hip
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()


 class CustomOp(nn.Module):
@@ -34,7 +38,64 @@ class CustomOp(nn.Module):
     def dispatch_forward(self):
         if _is_cuda:
             return self.forward_cuda
-        elif _is_rocm:
+        elif _is_hip:
             return self.forward_hip
         else:
             return self.forward_native
+
+
+if _is_cuda:
+    from sgl_kernel import sgl_per_tensor_quant_fp8, sgl_per_token_quant_fp8
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize input tensor to FP8 (8-bit floating point) format.
+
+        Args:
+            input (torch.Tensor): Input tensor to be quantized
+            scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
+                If None, scales will be computed dynamically.
+            use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
+                determines the quantization granularity:
+                - True: compute scale per token
+                - False: compute single scale per tensor
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+                - quantized_tensor: The FP8 quantized version of input
+                - scale_tensor: The scaling factors used for quantization
+
+        Raises:
+            AssertionError: If input is not 2D or if static scale's numel != 1
+        """
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                sgl_per_token_quant_fp8(input, output, scale)
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                sgl_per_tensor_quant_fp8(
+                    input, output, scale, is_static=False
+                )  # False for dynamic
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            sgl_per_tensor_quant_fp8(
+                input, output, scale, is_static=True
+            )  # True for static
+
+        return output, scale
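A minimal sketch of how a caller might use the new `scaled_fp8_quant` helper, assuming a CUDA build with the `sgl_kernel` extension available; shapes and dtypes are illustrative.

import torch

from sglang.srt.custom_op import scaled_fp8_quant

# Toy (num_tokens, hidden_size) activations on GPU.
x = torch.randn(8, 4096, device="cuda", dtype=torch.float16)

# Dynamic per-token quantization: one fp32 scale per row.
q_tok, tok_scales = scaled_fp8_quant(x, use_per_token_if_dynamic=True)
assert tok_scales.shape == (8, 1)

# Dynamic per-tensor quantization: a single fp32 scale for the whole tensor.
q_tensor, tensor_scale = scaled_fp8_quant(x)
assert tensor_scale.numel() == 1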
sglang/srt/distributed/device_communicators/custom_all_reduce.py CHANGED
@@ -22,15 +22,16 @@ from sglang.srt.utils import cuda_device_count_stateless, is_cuda, is_hip

 logger = logging.getLogger(__name__)

-is_hip_ = is_hip()
+_is_cuda = is_cuda()
+_is_hip = is_hip()

-if is_cuda():
+if _is_cuda:
     try:
         import pynvml
     except ImportError as e:
         logger.warning("Failed to import pynvml with %r", e)

-if is_hip_:
+if _is_hip:
     try:
         from amdsmi import (
             AmdSmiException,
@@ -43,7 +44,7 @@ if is_hip_:
         logger.warning("Failed to import amdsmi with %r", e)

 try:
-    if ops.use_vllm_custom_allreduce and not is_hip_:
+    if ops.use_vllm_custom_allreduce and not _is_hip:
         # Use vLLM custom allreduce
         ops.meta_size()
     else:
@@ -63,7 +64,7 @@ _R = TypeVar("_R")
 def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
     @wraps(fn)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
-        if is_hip_:
+        if _is_hip:
             try:
                 amdsmi_init()
                 return fn(*args, **kwargs)
@@ -81,7 +82,7 @@ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:

 @with_nvml_context
 def is_full_nvlink(physical_device_ids: List[int], world_size: int) -> bool:
-    if is_hip_:
+    if _is_hip:
         """
         query if the set of gpus are fully connected by xgmi (1 hop)
         """
@@ -145,7 +146,7 @@ def is_weak_contiguous(inp: torch.Tensor):
 class CustomAllreduce:
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
     _MAX_CAR_SIZE = 8192 * 1024
-    if is_hip_:
+    if _is_hip:
         # crossover is at 16MB buffer size for ROCm
         _MAX_CAR_SIZE = 2 * 8192 * 1024

@@ -229,7 +230,7 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
         # this checks hardware and driver support for NVLink
-        if is_cuda() or is_hip_:
+        if _is_cuda or _is_hip:
             full_nvlink = is_full_nvlink(physical_device_ids, world_size)

         if world_size > 2 and not full_nvlink:
@@ -243,7 +244,7 @@ class CustomAllreduce:
         # this is expensive to compute at the first time
         # then we cache the result
         # On AMD GPU, p2p is always enabled between XGMI connected GPUs
-        if not is_hip_ and not _can_p2p(rank, world_size):
+        if not _is_hip and not _can_p2p(rank, world_size):
             logger.warning(
                 "Custom allreduce is disabled because your platform lacks "
                 "GPU P2P capability or P2P test failed. To silence this "
@@ -256,7 +257,7 @@ class CustomAllreduce:
         self.world_size = world_size
         self.full_nvlink = full_nvlink

-        if ops.use_vllm_custom_allreduce and not is_hip_:
+        if ops.use_vllm_custom_allreduce and not _is_hip:
             # Buffers memory are owned by this Python class and passed to C++.
             # Meta data composes of two parts: meta data for synchronization and a
             # temporary buffer for storing intermediate allreduce results.
@@ -279,7 +280,7 @@ class CustomAllreduce:
             )
             ops.register_buffer(self._ptr, self.buffer_ptrs)
         else:
-            if is_hip_:
+            if _is_hip:
                 # meta data buffers need to be "uncached" for signal on MI200
                 self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
                 self.buffer = torch.empty(
@@ -418,7 +419,7 @@ class CustomAllreduce:
         ops.register_buffer(self._ptr, inp, handles, offsets)

     def register_graph_buffers(self):
-        if is_hip_:
+        if _is_hip:
             handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
             handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
             logger.info("Registering %d cuda graph addresses", len(offset))
@@ -454,12 +455,12 @@ class CustomAllreduce:
             return False
         # for 4 or more non NVLink-capable GPUs, custom allreduce provides
         # little performance improvement over NCCL.
-        if ops.use_vllm_custom_allreduce and not is_hip_:
+        if ops.use_vllm_custom_allreduce and not _is_hip:
             if self.world_size == 2 or self.full_nvlink:
                 return inp_size < self.max_size
             return False

-        if is_hip_:
+        if _is_hip:
             if self.full_nvlink:
                 if self.world_size == 8:
                     if self.MSCCL:
@@ -532,7 +533,7 @@ class CustomAllreduce:
             return None
         if self._IS_CAPTURING:
             if torch.cuda.is_current_stream_capturing():
-                if is_hip_:
+                if _is_hip:
                     return self.all_reduce_reg(input)
                 else:
                     return self.all_reduce(input, registered=True)
@@ -541,7 +542,7 @@ class CustomAllreduce:
             # allreduce is out-of-place.
             return torch.empty_like(input)
         else:
-            if is_hip_:
+            if _is_hip:
                 # note: outside of cuda graph context,
                 # custom allreduce incurs a cost of cudaMemcpy, which should
                 # be small(<=1% of overall latency) compared to the performance
@@ -556,7 +557,7 @@ class CustomAllreduce:
         if ops.use_vllm_custom_allreduce:
             self.free_shared_buffer(self.meta_ptrs)
             self.free_shared_buffer(self.buffer_ptrs)
-        elif is_cuda():
+        elif _is_cuda:
             self.free_shared_buffer(self.buffer_ptrs)
             self.free_shared_buffer(self.tmp_result_buffer_ptrs)
             self.free_shared_buffer(self.barrier_in_ptrs)
sglang/srt/distributed/parallel_state.py CHANGED
@@ -1228,7 +1228,16 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         ray.shutdown()
     gc.collect()
     if not current_platform.is_cpu():
-        torch.cuda.empty_cache()
+        if hasattr(torch, "cuda") and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            if hasattr(torch._C, "_host_emptyCache"):
+                torch._C._host_emptyCache()
+            else:
+                logger.warning(
+                    "torch._C._host_emptyCache() only available in Pytorch >=2.5"
+                )
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()


 def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
sglang/srt/entrypoints/engine.py CHANGED
@@ -106,6 +106,8 @@ class Engine:
         tokenizer_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args
         )
+
+        self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info

@@ -214,13 +216,13 @@ class Engine:
     def encode(
         self,
         prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[Union[List[str], str]] = None,
     ) -> Dict:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
         Please refer to `EmbeddingReqInput` for the documentation.
         """
-
-        obj = EmbeddingReqInput(text=prompt)
+        obj = EmbeddingReqInput(text=prompt, image_data=image_data)
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
         ret = loop.run_until_complete(generator.__anext__())
@@ -374,7 +376,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.2.post1",
+            "0.2.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
sglang/srt/entrypoints/http_server.py CHANGED
@@ -614,7 +614,7 @@ def launch_server(

     Note:
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
-    2. Inter-process communication is done through ICP (each process uses a different port) via the ZMQ library.
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     tokenizer_manager, scheduler_info = _launch_subprocesses(server_args=server_args)
     set_global_state(
sglang/srt/function_call_parser.py CHANGED
@@ -318,6 +318,10 @@ class Qwen25Detector(BaseFormatDetector):
         self.bot_token = "<tool_call>"
         self.eot_token = "</tool_call>"

+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Qwen 2.5 format tool call."""
+        return self.bot_token in text
+
     def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
         """
         One-time parsing: Detects and parses tool calls in the provided text.
@@ -352,6 +356,10 @@ class MistralDetector(BaseFormatDetector):
         self.bot_token = "[TOOL_CALLS] ["
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)

+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Mistral format tool call."""
+        return self.bot_token in text
+
     def _clean_text(self, text: str) -> str:
         """
         clean text to only leave ''[TOOL_CALLS] [{"name": xxx, "arguments": {xxx}}]'
@@ -397,12 +405,21 @@ class Llama32Detector(BaseFormatDetector):
         super().__init__()
         self.bot_token = "<|python_tag|>"

+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Llama 3.2 format tool call."""
+        # depending on the prompt format the Llama model may or may not
+        # prefix the output with the <|python_tag|> token
+        return "<|python_tag|>" in text or text.startswith("{")
+
     def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
         """Parse function calls from text, handling multiple JSON objects."""
-        if "<|python_tag|>" not in text:
+        if "<|python_tag|>" not in text and not text.startswith("{"):
             return []

-        _, action_text = text.split("<|python_tag|>")
+        if "<|python_tag|>" in text:
+            _, action_text = text.split("<|python_tag|>")
+        else:
+            action_text = text

         # Split by semicolon and process each part
         json_parts = [part.strip() for part in action_text.split(";") if part.strip()]
@@ -501,6 +518,20 @@ class FunctionCallParser:
         self.multi_format_parser = MultiFormatParser(detectors)
         self.tools = tools

+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a tool call in the format supported by this parser.
+        This delegates to the detector's implementation.
+
+        :param text: The text to check for tool calls
+        :return: True if the text contains a tool call, False otherwise
+        """
+        # Check all detectors in the multi_format_parser
+        for detector in self.multi_format_parser.detectors:
+            if detector.has_tool_call(text):
+                return True
+        return False
+
     def parse_non_stream(self, full_text: str):
         """
         Non-streaming call: one-time parsing
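The `has_tool_call` additions give callers a cheap pre-check before running full parsing. A hedged sketch of the intended usage follows; the empty tools list, the "qwen25" parser key, and the sample model output are illustrative assumptions.

from sglang.srt.function_call_parser import FunctionCallParser

# In real usage the tools list comes from the ChatCompletionRequest; an empty
# list is enough to exercise the new has_tool_call() pre-check.
parser = FunctionCallParser(tools=[], tool_call_parser="qwen25")

output = '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'

# Only run full tool-call parsing (and the tool_calls response format) when a
# tool-call marker is actually present in the generated text.
if parser.has_tool_call(output):
    print("tool call detected; full parsing applies")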
sglang/srt/hf_transformers_utils.py CHANGED
@@ -30,13 +30,20 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

-from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2_5_VLConfig
+from sglang.srt.configs import (
+    ChatGLMConfig,
+    DbrxConfig,
+    ExaoneConfig,
+    MultiModalityConfig,
+    Qwen2_5_VLConfig,
+)

 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
     DbrxConfig.model_type: DbrxConfig,
     ExaoneConfig.model_type: ExaoneConfig,
     Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
+    MultiModalityConfig.model_type: MultiModalityConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
@@ -66,6 +73,14 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
+
+    # FIXME: Pour contents of janus-pro's langauge_config to first-level
+    if isinstance(model, str) and model.lower().startswith("deepseek-ai/janus-pro"):
+        assert hasattr(config, "language_config")
+        for key, val in config.language_config.__dict__.items():
+            setattr(config, key, val)
+        setattr(config, "architectures", ["MultiModalityCausalLM"])
+
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
         config = config_class.from_pretrained(model, revision=revision)
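A small sketch of the effect of the new Janus-Pro handling in `get_config`; the model name is taken from the check in the hunk, while the call shape (keyword arguments beyond those visible above) is an assumption.

from sglang.srt.hf_transformers_utils import get_config

# For Janus-Pro checkpoints, the nested language_config is flattened onto the
# top-level config and architectures is forced to ["MultiModalityCausalLM"].
config = get_config("deepseek-ai/Janus-Pro-7B", trust_remote_code=True)
print(config.architectures)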
@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available

 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention