sglang-0.5.0rc1-py3-none-any.whl → sglang-0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -7
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +25 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -2
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +29 -4
- sglang/srt/entrypoints/http_server.py +76 -0
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +23 -6
- sglang/srt/entrypoints/openai/serving_completions.py +10 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +14 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
- sglang/srt/layers/attention/triton_backend.py +109 -73
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
- sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +58 -10
- sglang/srt/layers/dp_attention.py +137 -27
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +16 -18
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +18 -46
- sglang/srt/layers/quantization/awq.py +22 -23
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +17 -21
- sglang/srt/layers/quantization/marlin_utils.py +26 -8
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +217 -98
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +222 -39
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +77 -2
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/layers.py +6 -2
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +80 -19
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +23 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +22 -48
- sglang/srt/managers/scheduler.py +28 -20
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +88 -39
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +10 -157
- sglang/srt/mem_cache/allocator_ascend.py +147 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +33 -33
- sglang/srt/model_executor/forward_batch_info.py +11 -10
- sglang/srt/model_executor/model_runner.py +93 -78
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +5 -2
- sglang/srt/models/deepseek_v2.py +226 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +27 -65
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +41 -76
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama.py +10 -2
- sglang/srt/models/llama4.py +18 -7
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +23 -23
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +84 -0
- sglang/srt/models/qwen3_moe.py +27 -43
- sglang/srt/models/step3_vl.py +8 -3
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +22 -2
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +264 -105
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +20 -19
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
- sglang/srt/layers/quantization/fp4.py +0 -557
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -61,7 +61,6 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -267,7 +266,6 @@ def extend(reqs, model_runner):
         model_config=model_runner.model_config,
         enable_overlap=False,
         spec_algorithm=SpeculativeAlgorithm.NONE,
-        enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
     _maybe_prepare_mlp_sync_batch(batch, model_runner)
@@ -301,11 +299,6 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
         disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
         spec_algorithm=SpeculativeAlgorithm.NONE,
         speculative_num_draft_tokens=None,
-        enable_two_batch_overlap=model_runner.server_args.enable_two_batch_overlap,
-        enable_deepep_moe=MoeA2ABackend(
-            model_runner.server_args.moe_a2a_backend
-        ).is_deepep(),
-        deepep_mode=DeepEPMode(model_runner.server_args.deepep_mode),
         require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
         disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
     )
sglang/bench_one_batch_server.py
CHANGED
@@ -26,7 +26,7 @@ from sglang.bench_serving import get_tokenizer, sample_random_requests
 from sglang.profiler import run_profile
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_process_tree
+from sglang.srt.utils import is_blackwell, kill_process_tree
 from sglang.test.test_utils import is_in_ci, write_github_step_summary
 
 
@@ -363,7 +363,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         acc_length,
         trace_link,
     ) in result:
-        hourly_cost = 2 * server_args.tp_size  # $2/hour for one H100
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
         input_util = 0.7
         accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
         line = (
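The hunk above makes the per-GPU hourly rate depend on the detected GPU generation, so the benchmark summary reports a rough dollar figure per run. A minimal sketch of that arithmetic, illustrative only and using the rates hard-coded in the diff (not market prices):

# Sketch of the cost estimate added above (illustrative, not part of the package)
def estimated_hourly_cost(tp_size: int, on_blackwell: bool) -> int:
    hourly_cost_per_gpu = 4 if on_blackwell else 2  # $/GPU-hour: B200 vs. H100
    return hourly_cost_per_gpu * tp_size

print(estimated_hourly_cost(tp_size=8, on_blackwell=False))  # 8 GPUs x $2 = 16 ($/hour)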
sglang/bench_serving.py
CHANGED
@@ -864,11 +864,11 @@ def sample_mmmu_requests(
                 if image.mode == "RGBA":
                     image = image.convert("RGB")
 
-                # Encode image to base64
+                # Encode image to base64 (save as PNG to support palette/alpha modes)
                 buffered = io.BytesIO()
-                image.save(buffered, format="JPEG")
+                image.save(buffered, format="PNG")
                 img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
-                image_data = f"data:image/jpeg;base64,{img_str}"
+                image_data = f"data:image/png;base64,{img_str}"
             else:
                 continue
 
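The hunk above switches the MMMU image payload to PNG so that palette- and alpha-mode images survive encoding. A self-contained sketch of the same data-URI construction (illustrative only; it uses the standard library's base64 in place of pybase64, which is equivalent here):

# Sketch of the PNG data-URI encoding used above (illustrative, not part of the package)
import base64
import io

from PIL import Image

image = Image.new("P", (4, 4))      # palette-mode image; JPEG cannot store mode "P" directly
if image.mode == "RGBA":
    image = image.convert("RGB")
buffered = io.BytesIO()
image.save(buffered, format="PNG")  # PNG keeps palette/alpha modes intact
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
image_data = f"data:image/png;base64,{img_str}"
print(image_data[:40])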
sglang/eval/llama3_eval.py
CHANGED
sglang/srt/configs/model_config.py
CHANGED
@@ -32,6 +32,7 @@ from sglang.srt.hf_transformers_utils import (
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
@@ -166,19 +167,20 @@ class ModelConfig:
         derived_context_len = get_context_length(self.hf_text_config)
         if context_length is not None:
             if context_length > derived_context_len:
-                if get_bool_env_var(
-                    "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"
+                reason = "Target model's" if is_draft_model else "User-specified"
+                msg = (
+                    f"Warning: {reason} context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                    f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
+                )
+                if (
+                    get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
+                    or is_in_ci()  # FIXME: fix this special case
                 ):
-                    logger.warning(
-                        f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
-                        f"This may lead to incorrect model outputs or CUDA errors."
-                    )
+                    logger.warning(msg)
                     self.context_len = context_length
                 else:
                     raise ValueError(
-                        f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
-                        f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
-                        f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                        f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
                     )
             else:
                 self.context_len = context_length
@@ -341,6 +343,19 @@ class ModelConfig:
                 "kv_n_heads",
                 self.hf_config.num_attention_heads,
             )
+        if self.hf_config.model_type in ["nemotron-nas"]:
+            nkvh = {
+                self.hf_config.num_attention_heads // block.attention.n_heads_in_group
+                for block in self.hf_config.block_configs
+                if not block.attention.no_op
+            }
+            if len(nkvh) == 0:
+                raise RuntimeError("Couldn't determine number of kv heads")
+            if len(nkvh) > 1:
+                raise ValueError(
+                    "Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
+                )
+            return next(iter(nkvh))
 
         attributes = [
             # For Falcon:
@@ -642,6 +657,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "InternLM2ForRewardModel" in model_architectures
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
+        or "Qwen3ForSequenceClassification" in model_architectures
         or "CLIPModel" in model_architectures
         or "BertModel" in model_architectures
         or "Contriever" in model_architectures
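The nemotron-nas branch above derives the KV-head count from per-block attention settings and rejects configs whose blocks disagree (variable GQA). A small worked illustration of that set-comprehension logic; BlockConfig and AttnConfig below are stand-in dataclasses, not the real Hugging Face config classes:

# Worked illustration of the nemotron-nas KV-head derivation (stand-in config classes)
from dataclasses import dataclass


@dataclass
class AttnConfig:
    n_heads_in_group: int
    no_op: bool = False


@dataclass
class BlockConfig:
    attention: AttnConfig


num_attention_heads = 32
block_configs = [
    BlockConfig(AttnConfig(n_heads_in_group=8)),
    BlockConfig(AttnConfig(n_heads_in_group=8)),
    BlockConfig(AttnConfig(n_heads_in_group=1, no_op=True)),  # no-op attention blocks are skipped
]

nkvh = {
    num_attention_heads // block.attention.n_heads_in_group
    for block in block_configs
    if not block.attention.no_op
}
assert len(nkvh) == 1                   # more than one value would mean variable GQA, which is rejected
num_key_value_heads = next(iter(nkvh))  # 32 // 8 = 4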
sglang/srt/configs/update_config.py
CHANGED
@@ -49,14 +49,25 @@ def get_num_heads_padding_size(tp_size, weight_block_size):
 
 
 def update_intermediate_size(model_config, attr_name, intermediate_padding_size):
-    if hasattr(model_config.hf_config, attr_name):
+    attr_value = intermediate_padding_size
+    if hasattr(model_config, "hf_config") and hasattr(
+        model_config.hf_config, attr_name
+    ):
         attr_value = getattr(model_config.hf_config, attr_name)
-        if attr_value % intermediate_padding_size != 0:
-            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+    elif hasattr(model_config, attr_name):
+        attr_value = getattr(model_config, attr_name)
+
+    if attr_value % intermediate_padding_size != 0:
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
 
-            attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+        attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+        if hasattr(model_config, "hf_config"):
             setattr(model_config.hf_config, attr_name, attr_value)
-            setattr(model_config.hf_text_config, attr_name, attr_value)
+            if hasattr(model_config, "hf_text_config"):
+                setattr(model_config.hf_text_config, attr_name, attr_value)
+        else:
+            setattr(model_config, attr_name, attr_value)
+
     return model_config
 
 
@@ -118,4 +129,28 @@ def adjust_config_with_unaligned_cpu_tp(
     model_config = update_intermediate_size(
         model_config, "intermediate_size_mlp", intermediate_padding_size
     )
+    if (
+        hasattr(model_config.hf_config, "vision_config")
+        and model_config.hf_config.vision_config.model_type == "siglip_vision_model"
+    ):
+        model_config.hf_config.vision_config.original_num_attention_heads = (
+            model_config.num_attention_heads
+        )
+        if model_config.hf_config.vision_config.num_attention_heads % tp_size != 0:
+            model_config.hf_config.vision_config.head_dim = (
+                model_config.hf_config.vision_config.hidden_size
+                // model_config.hf_config.vision_config.num_attention_heads
+            )
+            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+            pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+            model_config.hf_config.vision_config.num_attention_heads = pad_vocab_size(
+                model_config.hf_config.vision_config.num_attention_heads, pad_size
+            )
+            model_config.hf_config.vision_config = update_intermediate_size(
+                model_config.hf_config.vision_config,
+                "intermediate_size",
+                intermediate_padding_size,
+            )
+
     return model_config
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -32,10 +32,15 @@ from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
     BaseGrammarObject,
 )
-from sglang.srt.constrained.triton_ops.bitmask_ops import (
-    apply_token_bitmask_inplace_triton,
-)
-
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+if _is_hip:
+    from sgl_kernel import apply_token_bitmask_inplace_cuda
+else:
+    from sglang.srt.constrained.triton_ops.bitmask_ops import (
+        apply_token_bitmask_inplace_triton,
+    )
 logger = logging.getLogger(__name__)
 
 
@@ -94,7 +99,10 @@ class XGrammarGrammar(BaseGrammarObject):
 
     def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
         if logits.device.type == "cuda":
-            apply_token_bitmask_inplace_triton(logits, vocab_mask)
+            if _is_hip:
+                apply_token_bitmask_inplace_cuda(logits, vocab_mask)
+            else:
+                apply_token_bitmask_inplace_triton(logits, vocab_mask)
         elif logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
             self.apply_vocab_mask_cpu(logits, vocab_mask)
         else:
@@ -154,12 +162,16 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     ):
         super().__init__()
 
-        # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
-        # This ensures consistency between what the model considers EOS and what XGrammar uses
-        tokenizer_info = TokenizerInfo.from_huggingface(
-            tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
-        )
-        override_stop_tokens = None
+        if hasattr(tokenizer, "init_xgrammar"):
+            # For special tokenizer
+            tokenizer_info, override_stop_tokens = tokenizer.init_xgrammar()
+        else:
+            # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
+            # This ensures consistency between what the model considers EOS and what XGrammar uses
+            tokenizer_info = TokenizerInfo.from_huggingface(
+                tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
+            )
+            override_stop_tokens = None
 
         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
sglang/srt/conversation.py
CHANGED
@@ -625,7 +625,7 @@ def generate_chat_conv(
                 real_content += content.text
             elif content.type == "image_url":
                 # NOTE: works for llava and intervl2_5
-                if conv.name in ["internvl-2-5", "interns1"]:
+                if conv.name in ["internvl-2-5"]:
                     real_content = image_token + real_content
                 else:
                     real_content += image_token
@@ -817,20 +817,7 @@ register_conv_template(
         sep_style=SeparatorStyle.MPT,
         sep="<|im_end|>\n",
         stop_str=["<|im_end|>", "<|action_end|>"],
-        image_token="<image>",
-    )
-)
-
-register_conv_template(
-    Conversation(
-        name="interns1",
-        system_template="<|im_start|>system\n{system_message}",
-        system_message="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
-        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
-        sep_style=SeparatorStyle.MPT,
-        sep="<|im_end|>\n",
-        stop_str=["<|im_end|>", "<|action_end|>"],
-        image_token="<image>",
+        image_token="<IMG_CONTEXT>",
     )
 )
 
sglang/srt/disaggregation/ascend/conn.py
CHANGED
@@ -23,9 +23,7 @@ class AscendKVManager(MooncakeKVManager):
         )
 
     def register_buffer_to_engine(self):
-        self.engine.register(
-            self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
-        )
+        self.engine.batch_register(self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens)
         # The Ascend backend optimize batch registration for small memory blocks.
         self.engine.batch_register(
             self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
sglang/srt/disaggregation/decode.py
CHANGED
@@ -259,7 +259,7 @@ class DecodePreallocQueue:
         if len(req.origin_input_ids) > self.max_total_num_tokens:
             message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
             logger.error(message)
-            prepare_abort(req, message)
+            prepare_abort(req, message, status_code=HTTPStatus.BAD_REQUEST)
             self.scheduler.stream_output([req], req.return_logprob)
             return True
         return False
@@ -864,7 +864,6 @@ class SchedulerDisaggregationDecodeMixin:
             self.model_config,
             self.enable_overlap,
             self.spec_algorithm,
-            self.server_args.enable_custom_logit_processor,
         )
 
         # construct fake completed prefill
sglang/srt/disaggregation/launch_lb.py
CHANGED
@@ -118,7 +118,13 @@ def main():
     lb_args = LBArgs.from_cli_args(args)
 
     prefill_configs = [PrefillConfig(url, port) for url, port in lb_args.prefill_infos]
-    run(prefill_configs, lb_args.decode_infos, lb_args.host, lb_args.port)
+    run(
+        prefill_configs,
+        lb_args.decode_infos,
+        lb_args.host,
+        lb_args.port,
+        lb_args.timeout,
+    )
 
 
 if __name__ == "__main__":
sglang/srt/disaggregation/mini_lb.py
CHANGED
@@ -50,10 +50,16 @@ class PrefillConfig:
 
 
 class MiniLoadBalancer:
-    def __init__(self, prefill_configs: List[PrefillConfig], decode_servers: List[str]):
+    def __init__(
+        self,
+        prefill_configs: List[PrefillConfig],
+        decode_servers: List[str],
+        timeout: int,
+    ):
         self.prefill_configs = prefill_configs
         self.prefill_servers = [p.url for p in prefill_configs]
         self.decode_servers = decode_servers
+        self.timeout = timeout
 
     def add_prefill_server(self, new_prefill_config: PrefillConfig):
         self.prefill_configs.append(new_prefill_config)
@@ -78,7 +84,7 @@ class MiniLoadBalancer:
 
         async with aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(
-                total=
+                total=self.timeout
             )  # Add timeout for request reliability
         ) as session:
             tasks = [
@@ -117,7 +123,7 @@ class MiniLoadBalancer:
         async def stream_results():
             async with aiohttp.ClientSession(
                 timeout=aiohttp.ClientTimeout(
-                    total=
+                    total=self.timeout
                 )  # Add timeout for request reliability
            ) as session:
                 # Create the tasks for both prefill and decode requests
@@ -401,9 +407,9 @@ async def register(obj: PDRegistryRequest):
     return Response(status_code=200)
 
 
-def run(prefill_configs, decode_addrs, host, port):
+def run(prefill_configs, decode_addrs, host, port, timeout):
     global load_balancer
-    load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs)
+    load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs, timeout=timeout)
     uvicorn.run(app, host=host, port=port)
 
 