sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +164 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +62 -23
  49. sglang/srt/layers/elementwise.py +411 -0
  50. sglang/srt/layers/layernorm.py +24 -2
  51. sglang/srt/layers/linear.py +17 -5
  52. sglang/srt/layers/logits_processor.py +26 -7
  53. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  54. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  55. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  56. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  63. sglang/srt/layers/moe/router.py +342 -0
  64. sglang/srt/layers/moe/topk.py +31 -18
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +184 -126
  67. sglang/srt/layers/quantization/base_config.py +5 -0
  68. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  69. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  70. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  75. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  76. sglang/srt/layers/quantization/fp8.py +76 -34
  77. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  78. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  79. sglang/srt/layers/quantization/gptq.py +36 -9
  80. sglang/srt/layers/quantization/kv_cache.py +98 -0
  81. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  82. sglang/srt/layers/quantization/utils.py +153 -0
  83. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  84. sglang/srt/layers/rotary_embedding.py +66 -87
  85. sglang/srt/layers/sampler.py +1 -1
  86. sglang/srt/lora/layers.py +68 -0
  87. sglang/srt/lora/lora.py +2 -22
  88. sglang/srt/lora/lora_manager.py +47 -23
  89. sglang/srt/lora/mem_pool.py +110 -51
  90. sglang/srt/lora/utils.py +12 -1
  91. sglang/srt/managers/cache_controller.py +4 -5
  92. sglang/srt/managers/data_parallel_controller.py +31 -9
  93. sglang/srt/managers/expert_distribution.py +81 -0
  94. sglang/srt/managers/io_struct.py +39 -3
  95. sglang/srt/managers/mm_utils.py +373 -0
  96. sglang/srt/managers/multimodal_processor.py +68 -0
  97. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  98. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  99. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  100. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  101. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  102. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  103. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  104. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  105. sglang/srt/managers/schedule_batch.py +134 -31
  106. sglang/srt/managers/scheduler.py +325 -38
  107. sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
  108. sglang/srt/managers/session_controller.py +1 -1
  109. sglang/srt/managers/tokenizer_manager.py +59 -23
  110. sglang/srt/managers/tp_worker.py +1 -1
  111. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  112. sglang/srt/managers/utils.py +6 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +27 -8
  114. sglang/srt/mem_cache/memory_pool.py +258 -98
  115. sglang/srt/mem_cache/paged_allocator.py +2 -2
  116. sglang/srt/mem_cache/radix_cache.py +4 -4
  117. sglang/srt/model_executor/cuda_graph_runner.py +85 -28
  118. sglang/srt/model_executor/forward_batch_info.py +81 -15
  119. sglang/srt/model_executor/model_runner.py +70 -6
  120. sglang/srt/model_loader/loader.py +160 -2
  121. sglang/srt/model_loader/weight_utils.py +45 -0
  122. sglang/srt/models/deepseek_janus_pro.py +29 -86
  123. sglang/srt/models/deepseek_nextn.py +22 -10
  124. sglang/srt/models/deepseek_v2.py +326 -192
  125. sglang/srt/models/deepseek_vl2.py +358 -0
  126. sglang/srt/models/gemma3_causal.py +684 -0
  127. sglang/srt/models/gemma3_mm.py +462 -0
  128. sglang/srt/models/grok.py +374 -119
  129. sglang/srt/models/llama.py +47 -7
  130. sglang/srt/models/llama_eagle.py +1 -0
  131. sglang/srt/models/llama_eagle3.py +196 -0
  132. sglang/srt/models/llava.py +3 -3
  133. sglang/srt/models/llavavid.py +3 -3
  134. sglang/srt/models/minicpmo.py +1995 -0
  135. sglang/srt/models/minicpmv.py +62 -137
  136. sglang/srt/models/mllama.py +4 -4
  137. sglang/srt/models/phi3_small.py +1 -1
  138. sglang/srt/models/qwen2.py +3 -0
  139. sglang/srt/models/qwen2_5_vl.py +68 -146
  140. sglang/srt/models/qwen2_classification.py +75 -0
  141. sglang/srt/models/qwen2_moe.py +9 -1
  142. sglang/srt/models/qwen2_vl.py +25 -63
  143. sglang/srt/openai_api/adapter.py +145 -47
  144. sglang/srt/openai_api/protocol.py +23 -2
  145. sglang/srt/sampling/sampling_batch_info.py +1 -1
  146. sglang/srt/sampling/sampling_params.py +6 -6
  147. sglang/srt/server_args.py +104 -14
  148. sglang/srt/speculative/build_eagle_tree.py +7 -347
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  150. sglang/srt/speculative/eagle_utils.py +208 -252
  151. sglang/srt/speculative/eagle_worker.py +139 -53
  152. sglang/srt/speculative/spec_info.py +6 -1
  153. sglang/srt/torch_memory_saver_adapter.py +22 -0
  154. sglang/srt/utils.py +182 -21
  155. sglang/test/__init__.py +0 -0
  156. sglang/test/attention/__init__.py +0 -0
  157. sglang/test/attention/test_flashattn_backend.py +312 -0
  158. sglang/test/runners.py +2 -0
  159. sglang/test/test_activation.py +2 -1
  160. sglang/test/test_block_fp8.py +5 -4
  161. sglang/test/test_block_fp8_ep.py +2 -1
  162. sglang/test/test_dynamic_grad_mode.py +58 -0
  163. sglang/test/test_layernorm.py +3 -2
  164. sglang/test/test_utils.py +55 -4
  165. sglang/utils.py +31 -0
  166. sglang/version.py +1 -1
  167. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  168. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
  169. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  170. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  171. sglang/srt/managers/image_processor.py +0 -55
  172. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  173. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  174. sglang/srt/managers/multi_modality_padding.py +0 -134
  175. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  176. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
@@ -44,7 +44,9 @@ class SeparatorStyle(IntEnum):
44
44
  CHATGLM3 = auto()
45
45
  DEEPSEEK_CHAT = auto()
46
46
  METAMATH = auto()
47
+ DeepSeekVL2 = auto()
47
48
  QWEN2_VL_EMBED = auto()
49
+ GEMMA3 = auto()
48
50
 
49
51
 
50
52
  @dataclasses.dataclass
@@ -71,9 +73,13 @@ class Conversation:
71
73
  stop_str: Union[str, List[str]] = None
72
74
  # The string that represents an image token in the prompt
73
75
  image_token: str = "<image>"
76
+ audio_token: str = "<audio>"
74
77
 
75
78
  image_data: Optional[List[str]] = None
76
79
  modalities: Optional[List[str]] = None
80
+ stop_token_ids: Optional[int] = None
81
+
82
+ audio_data: Optional[List[str]] = None
77
83
 
78
84
  def get_prompt(self) -> str:
79
85
  """Get the prompt for generation."""
@@ -285,6 +291,30 @@ class Conversation:
285
291
  else:
286
292
  ret += role + ":"
287
293
  return ret
294
+ elif self.sep_style == SeparatorStyle.DeepSeekVL2:
295
+ seps = [self.sep, self.sep2]
296
+ if system_prompt == "" or system_prompt is None:
297
+ ret = ""
298
+ else:
299
+ ret = system_prompt + seps[0]
300
+ for i, (role, message) in enumerate(self.messages):
301
+ if message:
302
+ ret += role + ": " + message + seps[i % 2]
303
+ else:
304
+ ret += role + ":"
305
+ return ret
306
+ elif self.sep_style == SeparatorStyle.GEMMA3:
307
+ ret = system_prompt
308
+ for i, (role, message) in enumerate(self.messages):
309
+ if message:
310
+ if i == 0:
311
+ ret += message + self.sep
312
+ else:
313
+ ret += role + message + self.sep
314
+ else:
315
+ ret += role
316
+ return ret
317
+
288
318
  else:
289
319
  raise ValueError(f"Invalid style: {self.sep_style}")
290
320
 
@@ -300,6 +330,10 @@ class Conversation:
300
330
  """Append a new message."""
301
331
  self.image_data.append(image)
302
332
 
333
+ def append_audio(self, audio: str):
334
+ """Append a new message."""
335
+ self.audio_data.append(audio)
336
+
303
337
  def update_last_message(self, message: str):
304
338
  """Update the last output.
305
339
 
@@ -346,6 +380,7 @@ class Conversation:
346
380
  sep2=self.sep2,
347
381
  stop_str=self.stop_str,
348
382
  image_token=self.image_token,
383
+ audio_token=self.audio_token,
349
384
  )
350
385
 
351
386
  def dict(self):
@@ -432,8 +467,10 @@ def generate_chat_conv(
432
467
  sep2=conv.sep2,
433
468
  stop_str=conv.stop_str,
434
469
  image_data=[],
470
+ audio_data=[],
435
471
  modalities=[],
436
472
  image_token=conv.image_token,
473
+ audio_token=conv.audio_token,
437
474
  )
438
475
 
439
476
  if isinstance(request.messages, str):
@@ -471,6 +508,7 @@ def generate_chat_conv(
471
508
  if conv.name != "qwen2-vl"
472
509
  else conv.image_token
473
510
  )
511
+ audio_token = conv.audio_token
474
512
  for content in message.content:
475
513
  if content.type == "text":
476
514
  if num_image_url > 16:
@@ -480,6 +518,10 @@ def generate_chat_conv(
480
518
  # NOTE: Only works for llava
481
519
  real_content += image_token
482
520
  conv.append_image(content.image_url.url)
521
+ elif content.type == "audio_url":
522
+ real_content += audio_token
523
+ conv.append_audio(content.audio_url.url)
524
+
483
525
  conv.append_message(conv.roles[0], real_content)
484
526
  elif msg_role == "assistant":
485
527
  parsed_content = ""
@@ -604,6 +646,37 @@ register_conv_template(
604
646
  )
605
647
  )
606
648
 
649
+ register_conv_template(
650
+ Conversation(
651
+ name="deepseek-vl2",
652
+ system_template="{system_message}",
653
+ # system_message="You are a helpful assistant. Please answer truthfully and write out your "
654
+ # "thinking step by step to be sure you get the right answer.",
655
+ system_message="",
656
+ roles=("<|User|>", "<|Assistant|>"),
657
+ messages=(),
658
+ offset=0,
659
+ sep_style=SeparatorStyle.DeepSeekVL2,
660
+ sep="\n\n",
661
+ sep2="<|end▁of▁sentence|>",
662
+ stop_str=["User:", "<|end▁of▁sentence|>"],
663
+ )
664
+ )
665
+
666
+ # Reference: https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json
667
+ register_conv_template(
668
+ Conversation(
669
+ name="gemma-it",
670
+ system_message="You are a helpful assistant.",
671
+ system_template="<start_of_turn>user{system_message}\n\n",
672
+ roles=("<start_of_turn>user\n", "<start_of_turn>model\n"),
673
+ sep="<end_of_turn>\n",
674
+ sep_style=SeparatorStyle.GEMMA3,
675
+ stop_str=["<end_of_turn>"],
676
+ image_token="<start_of_image>",
677
+ )
678
+ )
679
+
607
680
  # Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
608
681
  register_conv_template(
609
682
  Conversation(
@@ -646,3 +719,18 @@ register_conv_template(
646
719
  image_token="<image_placeholder>",
647
720
  )
648
721
  )
722
+
723
+ # Reference: https://huggingface.co/openbmb/MiniCPM-o-2_6#usage
724
+ register_conv_template(
725
+ Conversation(
726
+ name="minicpmo",
727
+ system_message="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
728
+ system_template="<|im_start|>system\n{system_message}",
729
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
730
+ sep="<|im_end|>\n",
731
+ sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
732
+ stop_str=("<|im_end|>", "<|endoftext|>"),
733
+ image_token="(<image>./</image>)",
734
+ audio_token="(<audio>./</audio>)",
735
+ )
736
+ )
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from enum import Enum
5
+ from typing import Optional
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class KVArgs:
14
+ engine_rank: int
15
+ kv_data_ptrs: list[int]
16
+ kv_data_lens: list[int]
17
+ kv_item_lens: list[int]
18
+ aux_data_ptrs: list[int]
19
+ aux_data_lens: list[int]
20
+ aux_item_lens: list[int]
21
+ ib_device: str
22
+
23
+
24
+ class KVManager:
25
+ def __init__(self, args: KVArgs): ...
26
+
27
+
28
+ class KVPoll:
29
+ Failed = 0
30
+ Bootstrapping = 1
31
+ WaitingForInput = 2
32
+ Transferring = 3
33
+ Success = 4
34
+
35
+
36
+ class KVSender:
37
+ def __init__(self, mgr: KVManager, bootstrap_addr: str, bootstrap_room: int):
38
+ self.has_sent = False
39
+
40
+ def init(self, num_kv_indices: int, aux_index: Optional[int] = None): ...
41
+
42
+ def send(self, kv_indices: npt.NDArray[np.int32]):
43
+ self.has_sent = True
44
+
45
+ def poll(self) -> KVPoll:
46
+ if self.has_sent is False:
47
+ # Assume handshake completed instantly
48
+ return KVPoll.WaitingForInput
49
+ else:
50
+ # Assume transfer completed instantly
51
+ return KVPoll.Success
52
+
53
+ def failure_exception(self):
54
+ raise Exception("Fake KVSender Exception")
55
+
56
+
57
+ class KVReceiver:
58
+ def __init__(
59
+ self, mgr: KVManager, bootstrap_addr: str, bootstrap_room: Optional[int] = None
60
+ ):
61
+ self.has_init = False
62
+
63
+ def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
64
+ self.has_init = True
65
+
66
+ def poll(self) -> KVPoll:
67
+ if self.has_init is False:
68
+ # Assume handshake completed instantly
69
+ return KVPoll.WaitingForInput
70
+ else:
71
+ # Assume transfer completed instantly
72
+ return KVPoll.Success
73
+
74
+ def failure_exception(self):
75
+ raise Exception("Fake KVReceiver Exception")
76
+
77
+
78
+ class KVBootstrapServer:
79
+ def __init__(self, port: int): ...
80
+
81
+ def poll(self) -> KVPoll: ...