sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +3 -1
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +667 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +63 -11
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/parallel_state.py +10 -3
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +71 -12
- sglang/srt/function_call_parser.py +164 -54
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +295 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +62 -23
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +26 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/moe/topk.py +31 -18
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +184 -126
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +24 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -9
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +66 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/layers.py +68 -0
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +47 -23
- sglang/srt/lora/mem_pool.py +110 -51
- sglang/srt/lora/utils.py +12 -1
- sglang/srt/managers/cache_controller.py +4 -5
- sglang/srt/managers/data_parallel_controller.py +31 -9
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +39 -3
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -31
- sglang/srt/managers/scheduler.py +325 -38
- sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -23
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +27 -8
- sglang/srt/mem_cache/memory_pool.py +258 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +85 -28
- sglang/srt/model_executor/forward_batch_info.py +81 -15
- sglang/srt/model_executor/model_runner.py +70 -6
- sglang/srt/model_loader/loader.py +160 -2
- sglang/srt/model_loader/weight_utils.py +45 -0
- sglang/srt/models/deepseek_janus_pro.py +29 -86
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +326 -192
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +684 -0
- sglang/srt/models/gemma3_mm.py +462 -0
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +145 -47
- sglang/srt/openai_api/protocol.py +23 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +104 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +139 -53
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +182 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +2 -0
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +55 -4
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/conversation.py
CHANGED
@@ -44,7 +44,9 @@ class SeparatorStyle(IntEnum):
|
|
44
44
|
CHATGLM3 = auto()
|
45
45
|
DEEPSEEK_CHAT = auto()
|
46
46
|
METAMATH = auto()
|
47
|
+
DeepSeekVL2 = auto()
|
47
48
|
QWEN2_VL_EMBED = auto()
|
49
|
+
GEMMA3 = auto()
|
48
50
|
|
49
51
|
|
50
52
|
@dataclasses.dataclass
|
@@ -71,9 +73,13 @@ class Conversation:
|
|
71
73
|
stop_str: Union[str, List[str]] = None
|
72
74
|
# The string that represents an image token in the prompt
|
73
75
|
image_token: str = "<image>"
|
76
|
+
audio_token: str = "<audio>"
|
74
77
|
|
75
78
|
image_data: Optional[List[str]] = None
|
76
79
|
modalities: Optional[List[str]] = None
|
80
|
+
stop_token_ids: Optional[int] = None
|
81
|
+
|
82
|
+
audio_data: Optional[List[str]] = None
|
77
83
|
|
78
84
|
def get_prompt(self) -> str:
|
79
85
|
"""Get the prompt for generation."""
|
@@ -285,6 +291,30 @@ class Conversation:
|
|
285
291
|
else:
|
286
292
|
ret += role + ":"
|
287
293
|
return ret
|
294
|
+
elif self.sep_style == SeparatorStyle.DeepSeekVL2:
|
295
|
+
seps = [self.sep, self.sep2]
|
296
|
+
if system_prompt == "" or system_prompt is None:
|
297
|
+
ret = ""
|
298
|
+
else:
|
299
|
+
ret = system_prompt + seps[0]
|
300
|
+
for i, (role, message) in enumerate(self.messages):
|
301
|
+
if message:
|
302
|
+
ret += role + ": " + message + seps[i % 2]
|
303
|
+
else:
|
304
|
+
ret += role + ":"
|
305
|
+
return ret
|
306
|
+
elif self.sep_style == SeparatorStyle.GEMMA3:
|
307
|
+
ret = system_prompt
|
308
|
+
for i, (role, message) in enumerate(self.messages):
|
309
|
+
if message:
|
310
|
+
if i == 0:
|
311
|
+
ret += message + self.sep
|
312
|
+
else:
|
313
|
+
ret += role + message + self.sep
|
314
|
+
else:
|
315
|
+
ret += role
|
316
|
+
return ret
|
317
|
+
|
288
318
|
else:
|
289
319
|
raise ValueError(f"Invalid style: {self.sep_style}")
|
290
320
|
|
@@ -300,6 +330,10 @@ class Conversation:
|
|
300
330
|
"""Append a new message."""
|
301
331
|
self.image_data.append(image)
|
302
332
|
|
333
|
+
def append_audio(self, audio: str):
|
334
|
+
"""Append a new message."""
|
335
|
+
self.audio_data.append(audio)
|
336
|
+
|
303
337
|
def update_last_message(self, message: str):
|
304
338
|
"""Update the last output.
|
305
339
|
|
@@ -346,6 +380,7 @@ class Conversation:
|
|
346
380
|
sep2=self.sep2,
|
347
381
|
stop_str=self.stop_str,
|
348
382
|
image_token=self.image_token,
|
383
|
+
audio_token=self.audio_token,
|
349
384
|
)
|
350
385
|
|
351
386
|
def dict(self):
|
@@ -432,8 +467,10 @@ def generate_chat_conv(
|
|
432
467
|
sep2=conv.sep2,
|
433
468
|
stop_str=conv.stop_str,
|
434
469
|
image_data=[],
|
470
|
+
audio_data=[],
|
435
471
|
modalities=[],
|
436
472
|
image_token=conv.image_token,
|
473
|
+
audio_token=conv.audio_token,
|
437
474
|
)
|
438
475
|
|
439
476
|
if isinstance(request.messages, str):
|
@@ -471,6 +508,7 @@ def generate_chat_conv(
|
|
471
508
|
if conv.name != "qwen2-vl"
|
472
509
|
else conv.image_token
|
473
510
|
)
|
511
|
+
audio_token = conv.audio_token
|
474
512
|
for content in message.content:
|
475
513
|
if content.type == "text":
|
476
514
|
if num_image_url > 16:
|
@@ -480,6 +518,10 @@ def generate_chat_conv(
|
|
480
518
|
# NOTE: Only works for llava
|
481
519
|
real_content += image_token
|
482
520
|
conv.append_image(content.image_url.url)
|
521
|
+
elif content.type == "audio_url":
|
522
|
+
real_content += audio_token
|
523
|
+
conv.append_audio(content.audio_url.url)
|
524
|
+
|
483
525
|
conv.append_message(conv.roles[0], real_content)
|
484
526
|
elif msg_role == "assistant":
|
485
527
|
parsed_content = ""
|
@@ -604,6 +646,37 @@ register_conv_template(
|
|
604
646
|
)
|
605
647
|
)
|
606
648
|
|
649
|
+
register_conv_template(
|
650
|
+
Conversation(
|
651
|
+
name="deepseek-vl2",
|
652
|
+
system_template="{system_message}",
|
653
|
+
# system_message="You are a helpful assistant. Please answer truthfully and write out your "
|
654
|
+
# "thinking step by step to be sure you get the right answer.",
|
655
|
+
system_message="",
|
656
|
+
roles=("<|User|>", "<|Assistant|>"),
|
657
|
+
messages=(),
|
658
|
+
offset=0,
|
659
|
+
sep_style=SeparatorStyle.DeepSeekVL2,
|
660
|
+
sep="\n\n",
|
661
|
+
sep2="<|end▁of▁sentence|>",
|
662
|
+
stop_str=["User:", "<|end▁of▁sentence|>"],
|
663
|
+
)
|
664
|
+
)
|
665
|
+
|
666
|
+
# Reference: https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json
|
667
|
+
register_conv_template(
|
668
|
+
Conversation(
|
669
|
+
name="gemma-it",
|
670
|
+
system_message="You are a helpful assistant.",
|
671
|
+
system_template="<start_of_turn>user{system_message}\n\n",
|
672
|
+
roles=("<start_of_turn>user\n", "<start_of_turn>model\n"),
|
673
|
+
sep="<end_of_turn>\n",
|
674
|
+
sep_style=SeparatorStyle.GEMMA3,
|
675
|
+
stop_str=["<end_of_turn>"],
|
676
|
+
image_token="<start_of_image>",
|
677
|
+
)
|
678
|
+
)
|
679
|
+
|
607
680
|
# Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
|
608
681
|
register_conv_template(
|
609
682
|
Conversation(
|
@@ -646,3 +719,18 @@ register_conv_template(
|
|
646
719
|
image_token="<image_placeholder>",
|
647
720
|
)
|
648
721
|
)
|
722
|
+
|
723
|
+
# Reference: https://huggingface.co/openbmb/MiniCPM-o-2_6#usage
|
724
|
+
register_conv_template(
|
725
|
+
Conversation(
|
726
|
+
name="minicpmo",
|
727
|
+
system_message="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
|
728
|
+
system_template="<|im_start|>system\n{system_message}",
|
729
|
+
roles=("<|im_start|>user", "<|im_start|>assistant"),
|
730
|
+
sep="<|im_end|>\n",
|
731
|
+
sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
|
732
|
+
stop_str=("<|im_end|>", "<|endoftext|>"),
|
733
|
+
image_token="(<image>./</image>)",
|
734
|
+
audio_token="(<audio>./</audio>)",
|
735
|
+
)
|
736
|
+
)
|
@@ -0,0 +1,81 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from enum import Enum
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
import numpy.typing as npt
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class KVArgs:
|
14
|
+
engine_rank: int
|
15
|
+
kv_data_ptrs: list[int]
|
16
|
+
kv_data_lens: list[int]
|
17
|
+
kv_item_lens: list[int]
|
18
|
+
aux_data_ptrs: list[int]
|
19
|
+
aux_data_lens: list[int]
|
20
|
+
aux_item_lens: list[int]
|
21
|
+
ib_device: str
|
22
|
+
|
23
|
+
|
24
|
+
class KVManager:
|
25
|
+
def __init__(self, args: KVArgs): ...
|
26
|
+
|
27
|
+
|
28
|
+
class KVPoll:
|
29
|
+
Failed = 0
|
30
|
+
Bootstrapping = 1
|
31
|
+
WaitingForInput = 2
|
32
|
+
Transferring = 3
|
33
|
+
Success = 4
|
34
|
+
|
35
|
+
|
36
|
+
class KVSender:
|
37
|
+
def __init__(self, mgr: KVManager, bootstrap_addr: str, bootstrap_room: int):
|
38
|
+
self.has_sent = False
|
39
|
+
|
40
|
+
def init(self, num_kv_indices: int, aux_index: Optional[int] = None): ...
|
41
|
+
|
42
|
+
def send(self, kv_indices: npt.NDArray[np.int32]):
|
43
|
+
self.has_sent = True
|
44
|
+
|
45
|
+
def poll(self) -> KVPoll:
|
46
|
+
if self.has_sent is False:
|
47
|
+
# Assume handshake completed instantly
|
48
|
+
return KVPoll.WaitingForInput
|
49
|
+
else:
|
50
|
+
# Assume transfer completed instantly
|
51
|
+
return KVPoll.Success
|
52
|
+
|
53
|
+
def failure_exception(self):
|
54
|
+
raise Exception("Fake KVSender Exception")
|
55
|
+
|
56
|
+
|
57
|
+
class KVReceiver:
|
58
|
+
def __init__(
|
59
|
+
self, mgr: KVManager, bootstrap_addr: str, bootstrap_room: Optional[int] = None
|
60
|
+
):
|
61
|
+
self.has_init = False
|
62
|
+
|
63
|
+
def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
|
64
|
+
self.has_init = True
|
65
|
+
|
66
|
+
def poll(self) -> KVPoll:
|
67
|
+
if self.has_init is False:
|
68
|
+
# Assume handshake completed instantly
|
69
|
+
return KVPoll.WaitingForInput
|
70
|
+
else:
|
71
|
+
# Assume transfer completed instantly
|
72
|
+
return KVPoll.Success
|
73
|
+
|
74
|
+
def failure_exception(self):
|
75
|
+
raise Exception("Fake KVReceiver Exception")
|
76
|
+
|
77
|
+
|
78
|
+
class KVBootstrapServer:
|
79
|
+
def __init__(self, port: int): ...
|
80
|
+
|
81
|
+
def poll(self) -> KVPoll: ...
|