sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
CHANGED
```diff
@@ -11,11 +11,11 @@ import tempfile
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
 from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
-from sglang.srt.utils import cuda_device_count_stateless
 
 logger = logging.getLogger(__name__)
 
@@ -218,7 +218,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
     is_distributed = dist.is_initialized()
 
-    num_dev = cuda_device_count_stateless()
+    num_dev = torch.cuda.device_count()
     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
```
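One behavioral nuance of this swap: the usual motivation for a "stateless" device-count helper is that `torch.cuda.device_count()` may cache its result once CUDA state is initialized, so the check now implicitly assumes `CUDA_VISIBLE_DEVICES` is fixed before the first CUDA call in the process. A minimal sketch of that assumption (illustrative only, not from the package):

```python
import os

import torch

# Set visibility before any CUDA initialization; device_count() may cache
# its answer after the first CUDA call, so later env changes would not be
# reflected in subsequent calls.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1")

num_dev = torch.cuda.device_count()
visible = os.environ.get("CUDA_VISIBLE_DEVICES") or ",".join(
    str(i) for i in range(num_dev)
)
print(num_dev, visible)  # e.g. 2 and "0,1" on a multi-GPU host
```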
sglang/srt/distributed/parallel_state.py
CHANGED
```diff
@@ -264,10 +264,16 @@ class GroupCoordinator:
         self.ca_comm: Optional[CustomAllreduce] = None
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
-            self.ca_comm = CustomAllreduce(
-                group=self.cpu_group,
-                device=self.device,
-            )
+            try:
+                self.ca_comm = CustomAllreduce(
+                    group=self.cpu_group,
+                    device=self.device,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Setup Custom allreduce failed with {e}. To silence this "
+                    "warning, specify --disable-custom-all-reduce explicitly."
+                )
 
         from sglang.srt.distributed.device_communicators.hpu_communicator import (
             HpuCommunicator,
```
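In effect, a failed custom all-reduce setup now degrades to the default all-reduce backend with a warning instead of aborting startup; per the message above, passing `--disable-custom-all-reduce` at launch skips the attempt entirely.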
```diff
@@ -439,6 +445,15 @@ class GroupCoordinator:
         else:
             torch.distributed.all_reduce(input_, group=self.device_group)
 
+    def reduce_scatter(
+        self,
+        output: torch.Tensor,
+        input_list: List[torch.Tensor],
+    ) -> None:
+        # TODO(ch-wan): support other backends
+        torch.distributed.reduce_scatter(output, input_list, group=self.device_group)
+        return output
+
     def _all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
         pynccl_comm = self.pynccl_comm
         if pynccl_comm is not None and not pynccl_comm.disabled:
```
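For orientation, the semantics of the `torch.distributed.reduce_scatter` call the new method wraps: each rank supplies `world_size` input shards, and each rank receives the elementwise reduction of the shard at its own rank index. A hedged sketch (the function and setup are illustrative, not part of the diff; assumes an initialized NCCL process group with one GPU per rank):

```python
import torch
import torch.distributed as dist


def demo_reduce_scatter(world_size: int) -> torch.Tensor:
    rank = dist.get_rank()
    # Rank r contributes world_size shards, all filled with the value r.
    shards = [
        torch.full((4,), float(rank), device="cuda") for _ in range(world_size)
    ]
    output = torch.empty(4, device="cuda")
    # Shard r from every rank is reduced (summed) onto rank r, so every rank
    # ends up holding sum(range(world_size)) in all four elements.
    dist.reduce_scatter(output, shards)
    return output
```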
```diff
@@ -456,11 +471,23 @@ class GroupCoordinator:
                 output, input, group_name=self.unique_name
             )
 
-    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    def all_gather(
+        self,
+        input_: torch.Tensor,
+        dim: int = -1,
+        tensor_list: List[torch.Tensor] = None,
+    ) -> torch.Tensor:
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
             return input_
+
+        if tensor_list is not None:
+            # TODO(ch-wan): support other backends
+            return torch.distributed.all_gather(
+                tensor_list, input_, group=self.device_group
+            )
+
         assert (
             -input_.dim() <= dim < input_.dim()
         ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
```
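A quick sketch of the two call shapes the extended signature supports (illustrative usage; `coord` stands in for an initialized `GroupCoordinator` and is not a name from the diff):

```python
import torch

x = torch.randn(2, 8, device="cuda")

# Path 1: default behavior, gather from all ranks and concatenate along dim.
gathered = coord.all_gather(x, dim=-1)  # shape (2, 8 * coord.world_size)

# Path 2 (new in this diff): fill a caller-provided list in place, one entry
# per rank, via the raw torch.distributed.all_gather.
out_list = [torch.empty_like(x) for _ in range(coord.world_size)]
coord.all_gather(x, tensor_list=out_list)
```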
sglang/srt/entrypoints/engine.py
CHANGED
```diff
@@ -151,10 +151,6 @@ class Engine:
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
         Please refer to `GenerateReqInput` for the documentation.
         """
-        modalities_list = []
-        if image_data is not None:
-            modalities_list.append("image")
-
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
@@ -165,7 +161,6 @@ class Engine:
             top_logprobs_num=top_logprobs_num,
             token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
-            modalities=modalities_list,
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=return_hidden_states,
             stream=stream,
```
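The removed lines mean `Engine.generate` no longer derives a `modalities` list from `image_data` before building `GenerateReqInput`; from the caller's side, multimodal inputs are still passed the same way. An illustrative call (model path, image URL, and parameters are placeholders, not from the diff):

```python
import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.2-11B-Vision-Instruct")
out = engine.generate(
    prompt="Describe this image.",
    image_data="https://example.com/cat.png",  # hypothetical URL
    sampling_params={"max_new_tokens": 64},
)
print(out["text"])
```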
sglang/srt/entrypoints/http_server.py
CHANGED
```diff
@@ -561,7 +561,13 @@ def available_models():
     served_model_names = [_global_state.tokenizer_manager.served_model_name]
     model_cards = []
    for served_model_name in served_model_names:
-        model_cards.append(ModelCard(id=served_model_name, root=served_model_name))
+        model_cards.append(
+            ModelCard(
+                id=served_model_name,
+                root=served_model_name,
+                max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+            )
+        )
     return ModelList(data=model_cards)
 
 
```
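With `max_model_len` on the model card, a client can discover the context window from the standard models endpoint. An illustrative query (assumes a server on sglang's default localhost:30000):

```python
import requests

resp = requests.get("http://localhost:30000/v1/models")
resp.raise_for_status()
for card in resp.json()["data"]:
    # max_model_len now mirrors the model's context_len, per the diff above.
    print(card["id"], card.get("max_model_len"))
```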
sglang/srt/entrypoints/verl_engine.py
CHANGED
```diff
@@ -19,6 +19,7 @@ import torch.distributed as dist
 from torch.distributed.tensor import DeviceMesh, DTensor
 
 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.server import Engine
 from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj
 
@@ -30,6 +31,7 @@ class VerlEngine:
         nnodes: int = 1,
         **kwargs,
     ):
+        monkey_patch_torch_reductions()
         self._device_mesh_cpu = device_mesh_cpu
         self._tp_rank = device_mesh_cpu.get_local_rank()
         self._tp_size = device_mesh_cpu.size()
```
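`monkey_patch_torch_reductions` comes from the new `sglang/srt/patch_torch.py` (+71 lines in the file list above); the diff shown here does not include its body. The general mechanism such a patch relies on — overriding how `ForkingPickler` serializes torch tensors sent between processes — looks roughly like this hedged sketch (the wrapper below is illustrative, not the actual patch):

```python
from multiprocessing.reduction import ForkingPickler

import torch
from torch.multiprocessing.reductions import reduce_tensor


def patched_reduce_tensor(tensor: torch.Tensor):
    # Delegate to torch's stock reducer; a real patch would rewrite the
    # rebuild arguments here (e.g., remap CUDA device identity) before
    # they reach the receiving process.
    rebuild_fn, rebuild_args = reduce_tensor(tensor)
    return rebuild_fn, rebuild_args


# Registering replaces the reducer used whenever a torch.Tensor is pickled
# for transfer between processes.
ForkingPickler.register(torch.Tensor, patched_reduce_tensor)
```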