sglang 0.4.1.post5__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +24 -16
- sglang/bench_one_batch.py +51 -3
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +37 -28
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +15 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/model_config.py +16 -6
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +449 -0
- sglang/srt/entrypoints/http_server.py +579 -0
- sglang/srt/layers/activation.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +27 -12
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +69 -0
- sglang/srt/layers/linear.py +76 -102
- sglang/srt/layers/logits_processor.py +48 -63
- sglang/srt/layers/moe/ep_moe/layer.py +4 -4
- sglang/srt/layers/moe/fused_moe_native.py +69 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -14
- sglang/srt/layers/moe/topk.py +4 -2
- sglang/srt/layers/parameter.py +26 -17
- sglang/srt/layers/quantization/__init__.py +22 -23
- sglang/srt/layers/quantization/fp8.py +112 -55
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/int8_kernel.py +54 -0
- sglang/srt/layers/quantization/modelopt_quant.py +2 -3
- sglang/srt/layers/quantization/w8a8_int8.py +117 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/rotary_embedding.py +1179 -31
- sglang/srt/layers/sampler.py +39 -1
- sglang/srt/layers/vocab_parallel_embedding.py +17 -4
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +46 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +23 -8
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +54 -15
- sglang/srt/managers/schedule_batch.py +49 -22
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +319 -181
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +303 -158
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +110 -77
- sglang/srt/metrics/collector.py +25 -11
- sglang/srt/model_executor/cuda_graph_runner.py +4 -6
- sglang/srt/model_executor/model_runner.py +80 -21
- sglang/srt/model_loader/loader.py +8 -6
- sglang/srt/model_loader/weight_utils.py +55 -2
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +3 -3
- sglang/srt/models/dbrx.py +4 -4
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +8 -8
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +6 -24
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +41 -4
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +6 -6
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +52 -4
- sglang/srt/models/qwen2_eagle.py +131 -0
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +3 -3
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
- sglang/srt/sampling/sampling_batch_info.py +153 -9
- sglang/srt/sampling/sampling_params.py +4 -2
- sglang/srt/server.py +4 -1037
- sglang/srt/server_args.py +84 -32
- sglang/srt/speculative/eagle_worker.py +1 -0
- sglang/srt/torch_memory_saver_adapter.py +59 -0
- sglang/srt/utils.py +130 -63
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +26 -13
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +126 -117
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/models/granite.py
CHANGED
@@ -22,9 +22,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import GraniteConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -36,6 +35,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/grok.py
CHANGED
@@ -22,12 +22,11 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -40,6 +39,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/internlm2.py
CHANGED
@@ -19,9 +19,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -32,6 +31,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/llama.py
CHANGED
@@ -22,9 +22,11 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -36,12 +38,16 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
 from sglang.srt.utils import make_layers
 from sglang.utils import get_exception_traceback
 
@@ -299,6 +305,30 @@ class LlamaModel(nn.Module):
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
 
 class LlamaForCausalLM(nn.Module):
 
@@ -534,9 +564,16 @@ class LlamaForCausalLM(nn.Module):
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
 
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
 
 class Phi3ForCausalLM(LlamaForCausalLM):
     pass
 
 
-EntryClass = [LlamaForCausalLM, Phi3ForCausalLM]
+class InternLM3ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+EntryClass = [LlamaForCausalLM, Phi3ForCausalLM, InternLM3ForCausalLM]
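The llama.py hunks above add a load_kv_cache_scales hook: kv_cache_scales_loader yields (layer_idx, scaling_factor) pairs for the current tensor-parallel rank, each layer's attention gets its k_scale and v_scale stamped, and LlamaForCausalLM simply delegates to LlamaModel. A minimal, self-contained sketch of that per-layer pattern follows; the Toy* classes, toy_scales_loader, and the hard-coded scale pairs are illustrative stand-ins, not sglang APIs.

# Toy sketch of the per-layer KV-cache-scale pattern introduced in llama.py.
# ToyAttention / ToyLayer / toy_scales_loader are stand-ins, not sglang APIs.
from typing import Iterable, List, Tuple


class ToyAttention:
    def __init__(self) -> None:
        # Start from known-good dummy scales, as the diff's comment requires.
        self.k_scale = 1.0
        self.v_scale = 1.0


class ToyLayer:
    def __init__(self) -> None:
        self.attn = ToyAttention()


def toy_scales_loader() -> Iterable[Tuple[int, float]]:
    # Stand-in for kv_cache_scales_loader: yields (layer_idx, scaling_factor) pairs.
    return [(0, 0.5), (1, 0.25)]


def load_kv_cache_scales(layers: List[ToyLayer]) -> None:
    for layer_idx, scaling_factor in toy_scales_loader():
        attn = layers[layer_idx].attn
        if hasattr(attn, "k_scale"):
            attn.k_scale = scaling_factor
            attn.v_scale = scaling_factor
        else:
            raise RuntimeError("Attention has no KV cache scaling factor attribute!")


layers = [ToyLayer(), ToyLayer()]
load_kv_cache_scales(layers)
assert layers[0].attn.k_scale == 0.5 and layers[1].attn.v_scale == 0.25

In the real model the pairs come from a quantization_param_path file parsed by kv_cache_scales_loader, sharded by tensor-parallel rank and world size.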
sglang/srt/models/minicpm.py
CHANGED
@@ -18,9 +18,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 from torch import nn
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -31,6 +30,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/minicpm3.py
CHANGED
@@ -19,20 +19,20 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
-from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,