sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -0
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +7 -7
- sglang/srt/disaggregation/decode.py +8 -3
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +4 -5
- sglang/srt/entrypoints/openai/protocol.py +0 -9
- sglang/srt/entrypoints/openai/serving_chat.py +59 -265
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +8 -10
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/quantization/__init__.py +5 -3
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/modelopt_quant.py +6 -11
- sglang/srt/layers/quantization/mxfp4.py +4 -1
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +21 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +6 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +35 -20
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +15 -7
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +25 -26
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +22 -3
- sglang/srt/model_executor/forward_batch_info.py +26 -5
- sglang/srt/model_executor/model_runner.py +129 -35
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_v2.py +74 -35
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +9 -9
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +136 -19
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/server_args.py +115 -139
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +12 -4
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +26 -30
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +127 -115
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/function_call/harmony_tool_parser.py
DELETED
@@ -1,130 +0,0 @@
-# Copyright 2023-2024 SGLang Team
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Harmony tool call parser for processing tool calls in harmony models."""
-
-import uuid
-from typing import List, Optional, Tuple
-
-from sglang.srt.entrypoints.openai.protocol import (
-    ChatMessage,
-    FunctionResponse,
-    ToolCall,
-)
-
-
-class HarmonyToolCallParser:
-    """Parser for extracting tool calls from harmony model outputs."""
-
-    def extract_tool_calls_from_message(self, msg) -> Optional[ToolCall]:
-        """
-        Extract tool call from a single message if it's a tool call.
-
-        Args:
-            msg: The harmony message
-
-        Returns:
-            ToolCall if the message is a tool call, None otherwise
-        """
-        if (
-            msg.channel == "commentary"
-            and msg.recipient
-            and msg.recipient.startswith("functions.")
-        ):
-            function_name = msg.recipient.split(".")[-1]
-            arguments = msg.content[0].text if msg.content else "{}"
-
-            return ToolCall(
-                id=f"call_{uuid.uuid4().hex[:24]}",
-                function=FunctionResponse(
-                    name=function_name,
-                    arguments=arguments,
-                ),
-            )
-        return None
-
-    def process_streaming_chunk(
-        self,
-        harmony_parser,
-        index: int,
-        tool_call_trackers: dict,
-        stream_buffers: dict,
-    ) -> Tuple[Optional[dict], bool, Optional[str]]:
-        """
-        Process a streaming chunk for tool calls.
-
-        Args:
-            harmony_parser: The harmony parser instance
-            index: The choice index
-            tool_call_trackers: Dict tracking tool calls per choice
-            stream_buffers: Dict for buffering content
-
-        Returns:
-            Tuple of (tool_call_data, is_tool_call, delta)
-        """
-        # Check if we're in a tool call
-        is_tool_call = (
-            harmony_parser.current_channel == "commentary"
-            and harmony_parser.current_recipient
-            and harmony_parser.current_recipient.startswith("functions.")
-        )
-
-        delta = harmony_parser.last_content_delta or ""
-        tool_call_data = None
-
-        if is_tool_call:
-            # Handle tool call streaming
-            function_name = harmony_parser.current_recipient.split(".")[-1]
-
-            # Track tool call indices per choice
-            if index not in tool_call_trackers:
-                tool_call_trackers[index] = {"count": 0, "current_function": None}
-
-            # Check if we just started a new tool call
-            tool_call_tracker = tool_call_trackers[index]
-            if tool_call_tracker["current_function"] != function_name:
-                # New tool call started
-                tool_call_tracker["current_function"] = function_name
-                tool_call_index = tool_call_tracker["count"]
-                tool_call_tracker["count"] += 1
-
-                # Store the tool call index for this function
-                tool_call_key = f"{index}_{function_name}"
-                stream_buffers[tool_call_key] = {
-                    "index": tool_call_index,
-                    "content": "",
-                }
-
-                tool_call_data = {
-                    "id": f"call_{uuid.uuid4().hex[:24]}",
-                    "index": tool_call_index,
-                    "function_name": function_name,
-                    "arguments": delta,
-                    "is_first_chunk": True,
-                }
-            else:
-                # Subsequent chunks for the same tool call
-                tool_call_key = f"{index}_{function_name}"
-                tool_call_index = stream_buffers[tool_call_key]["index"]
-
-                tool_call_data = {
-                    "id": None,
-                    "index": tool_call_index,
-                    "function_name": None,
-                    "arguments": delta,
-                    "is_first_chunk": False,
-                }
-
-            stream_buffers[tool_call_key]["content"] += delta
-
-        return tool_call_data, is_tool_call, delta
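For reference, the removed parser treated a harmony message as a tool call only when it arrived on the "commentary" channel with a "functions.<name>" recipient. The sketch below restates that detection rule in isolation; it is illustrative only and not part of the package. FakeContent and FakeHarmonyMessage are hypothetical stand-ins for the harmony message objects the parser received, and the tool-call handling appears to move toward the new sglang/srt/function_call/gpt_oss_detector.py listed above.

# Illustrative stand-ins only; FakeContent and FakeHarmonyMessage are NOT sglang types.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class FakeContent:
    text: str


@dataclass
class FakeHarmonyMessage:
    channel: str
    recipient: Optional[str]
    content: List[FakeContent] = field(default_factory=list)


def looks_like_tool_call(msg: FakeHarmonyMessage) -> bool:
    # Same condition the removed extract_tool_calls_from_message() used:
    # a "commentary" message addressed to a "functions.<name>" recipient.
    return (
        msg.channel == "commentary"
        and msg.recipient is not None
        and msg.recipient.startswith("functions.")
    )


msg = FakeHarmonyMessage(
    channel="commentary",
    recipient="functions.get_weather",
    content=[FakeContent(text='{"city": "Paris"}')],
)

if looks_like_tool_call(msg):
    function_name = msg.recipient.split(".")[-1]  # -> "get_weather"
    arguments = msg.content[0].text if msg.content else "{}"
    print(function_name, arguments)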
sglang/srt/lora/backend/flashinfer_backend.py
DELETED
@@ -1,131 +0,0 @@
-from typing import Tuple
-
-import torch
-
-from sglang.srt.lora.backend.base_backend import BaseLoRABackend
-from sglang.srt.lora.utils import LoRABatchInfo
-from sglang.srt.utils import is_flashinfer_available
-
-if is_flashinfer_available():
-    from flashinfer import SegmentGEMMWrapper
-
-
-class FlashInferLoRABackend(BaseLoRABackend):
-
-    def __init__(self, name: str, batch_info: LoRABatchInfo = None):
-        super().__init__(name, batch_info)
-
-        # Set up SGemm Wrapper from flashinfer
-        # FIXME wait for flashinfer segment gemm update
-        workspace_buffer = torch.empty(1 * 1024 * 1024, dtype=torch.int8, device="cuda")
-        self.segment_gemm = SegmentGEMMWrapper(workspace_buffer)
-
-    def run_lora_a_sgemm(
-        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
-    ) -> torch.Tensor:
-
-        return self.segment_gemm.run(
-            x=x,
-            weights=weights,
-            batch_size=self.batch_info.bs,
-            weight_column_major=True,
-            seg_indptr=self.batch_info.seg_indptr,
-            weight_indices=self.batch_info.weight_indices,
-        )
-
-    def run_lora_b_sgemm(
-        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
-    ) -> torch.Tensor:
-
-        return (
-            self.segment_gemm.run(
-                x=x,
-                weights=weights,
-                batch_size=self.batch_info.bs,
-                weight_column_major=True,
-                seg_indptr=self.batch_info.seg_indptr,
-                weight_indices=self.batch_info.weight_indices,
-            )
-            * self.batch_info.scalings[0]
-        )
-
-    def run_qkv_lora(
-        self,
-        x: torch.Tensor,
-        qkv_lora_a: torch.Tensor,
-        qkv_lora_b: Tuple[torch.Tensor],
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-
-        assert isinstance(qkv_lora_b, tuple) and len(qkv_lora_b) == 2
-
-        # Shape of lora_a_output: (s, 3 * r)
-        lora_a_output = self.run_lora_a_sgemm(x=x, weights=qkv_lora_a)
-
-        q_lora_b, kv_lora_b = qkv_lora_b
-        lora_rank = kv_lora_b.shape[-1]
-        output_dim_q = q_lora_b.shape[-2]
-        output_dim_kv = kv_lora_b.shape[-2]
-        lora_output = torch.empty(
-            (x.shape[0], output_dim_q + 2 * output_dim_kv),
-            device=x.device,
-            dtype=x.dtype,
-        )
-
-        # q
-        lora_output[:, :output_dim_q] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, :lora_rank].contiguous(), weights=q_lora_b[0]
-        )
-
-        # kv
-        lora_output[:, output_dim_q : output_dim_q + output_dim_kv] = (
-            self.run_lora_b_sgemm(
-                x=lora_a_output[:, lora_rank : 2 * lora_rank].contiguous(),
-                weights=kv_lora_b[0],
-            )
-        )
-
-        lora_output[
-            :, output_dim_q + output_dim_kv : output_dim_q + 2 * output_dim_kv
-        ] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, 2 * lora_rank : 3 * lora_rank].contiguous(),
-            weights=kv_lora_b[1],
-        )
-
-        return lora_output * self.batch_info.scalings[0]
-
-    def run_gate_up_lora(
-        self,
-        x: torch.Tensor,
-        gate_up_lora_a: torch.Tensor,
-        gate_up_lora_b: Tuple[torch.Tensor],
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-
-        assert isinstance(gate_up_lora_b, tuple) and len(gate_up_lora_b) == 2
-        lora_rank = gate_up_lora_b[0].shape[-1]
-        output_dim = gate_up_lora_b[0].shape[-2]
-
-        # Shape of lora_a_output: (s, 2 * r)
-        lora_a_output = self.run_lora_a_sgemm(x=x, weights=gate_up_lora_a)
-
-        lora_output = torch.empty(
-            (x.shape[0], 2 * output_dim),
-            device=x.device,
-            dtype=x.dtype,
-        )
-
-        # Compute lora for gate and up proj respectively
-        lora_output[:, :output_dim] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, :lora_rank].contiguous(),
-            weights=gate_up_lora_b[0],
-        )
-
-        lora_output[:, output_dim:] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, lora_rank:].contiguous(),
-            weights=gate_up_lora_b[1],
-        )
-
-        return lora_output * self.batch_info.scalings[0]
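To make the removed QKV path easier to follow, here is a shape-only sketch of how run_qkv_lora() sliced the rank-space activation and laid out its output columns as [ q | k | v ]. The dimensions are example values chosen for illustration, not sglang defaults, and nothing here calls flashinfer.

# Shape-only illustration of the removed run_qkv_lora(); example dimensions, no flashinfer.
import torch

s, rank = 4, 8                        # tokens in the batch, LoRA rank (example values)
output_dim_q, output_dim_kv = 32, 16  # q / kv output widths (example values)

# run_lora_a_sgemm() produced (s, 3 * rank): one rank-wide block per projection.
lora_a_output = torch.randn(s, 3 * rank)
q_part = lora_a_output[:, :rank]
k_part = lora_a_output[:, rank : 2 * rank]
v_part = lora_a_output[:, 2 * rank : 3 * rank]

# The combined result was (s, output_dim_q + 2 * output_dim_kv), filled slice by slice.
lora_output = torch.empty(s, output_dim_q + 2 * output_dim_kv)
lora_output[:, :output_dim_q] = torch.randn(s, output_dim_q)                                  # q columns
lora_output[:, output_dim_q : output_dim_q + output_dim_kv] = torch.randn(s, output_dim_kv)   # k columns
lora_output[:, output_dim_q + output_dim_kv :] = torch.randn(s, output_dim_kv)                # v columns

print(q_part.shape, k_part.shape, v_part.shape, lora_output.shape)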
/sglang/{api.py → lang/api.py}
RENAMED
File without changes