sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +302 -414
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +13 -8
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +144 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +773 -334
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +225 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +68 -37
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +102 -36
- sglang/srt/model_executor/cuda_graph_runner.py +56 -31
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +280 -81
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +135 -60
- sglang/srt/speculative/build_eagle_tree.py +8 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
- sglang/srt/speculative/eagle_utils.py +92 -57
- sglang/srt/speculative/eagle_worker.py +238 -111
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -32,13 +32,15 @@ import socket
|
|
32
32
|
import subprocess
|
33
33
|
import sys
|
34
34
|
import tempfile
|
35
|
+
import threading
|
35
36
|
import time
|
36
37
|
import warnings
|
37
38
|
from functools import lru_cache
|
38
39
|
from importlib.metadata import PackageNotFoundError, version
|
39
40
|
from io import BytesIO
|
41
|
+
from multiprocessing import Pool
|
40
42
|
from multiprocessing.reduction import ForkingPickler
|
41
|
-
from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
|
43
|
+
from typing import Any, Callable, Dict, List, Optional, Protocol, Set, Tuple, Union
|
42
44
|
|
43
45
|
import numpy as np
|
44
46
|
import psutil
|
@@ -311,7 +313,7 @@ def make_layers(
|
|
311
313
|
"""Make a list of layers with the given layer function"""
|
312
314
|
modules = torch.nn.ModuleList(
|
313
315
|
[
|
314
|
-
maybe_offload_to_cpu(layer_fn(idx=idx, prefix=
|
316
|
+
maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
|
315
317
|
for idx in range(num_hidden_layers)
|
316
318
|
]
|
317
319
|
)
|
@@ -480,6 +482,10 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):
|
|
480
482
|
|
481
483
|
def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
|
482
484
|
"""Kill the process and all its child processes."""
|
485
|
+
# Remove sigchld handler to avoid spammy logs.
|
486
|
+
if threading.current_thread() is threading.main_thread():
|
487
|
+
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
|
488
|
+
|
483
489
|
if parent_pid is None:
|
484
490
|
parent_pid = os.getpid()
|
485
491
|
include_parent = False
|
@@ -735,13 +741,6 @@ def pytorch_profile(name, func, *args, data_size=-1):
|
|
735
741
|
return result
|
736
742
|
|
737
743
|
|
738
|
-
def first_rank_print(*args, **kwargs):
|
739
|
-
if torch.cuda.current_device() == 0:
|
740
|
-
print(*args, **kwargs)
|
741
|
-
else:
|
742
|
-
pass
|
743
|
-
|
744
|
-
|
745
744
|
def get_zmq_socket(
|
746
745
|
context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
|
747
746
|
):
|
@@ -1154,9 +1153,9 @@ def set_gpu_proc_affinity(
|
|
1154
1153
|
|
1155
1154
|
if psutil.cpu_count() != psutil.cpu_count(logical=False):
|
1156
1155
|
# HT on
|
1157
|
-
|
1158
|
-
|
1159
|
-
bind_cpu_ids = list(itertools.chain(
|
1156
|
+
lower_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
|
1157
|
+
upper_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
|
1158
|
+
bind_cpu_ids = list(itertools.chain(lower_cpu_ids, upper_cpu_ids))
|
1160
1159
|
else:
|
1161
1160
|
# HT off
|
1162
1161
|
bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
|
@@ -1171,6 +1170,11 @@ def get_bool_env_var(name: str, default: str = "false") -> bool:
|
|
1171
1170
|
return value.lower() in ("true", "1")
|
1172
1171
|
|
1173
1172
|
|
1173
|
+
@lru_cache(maxsize=2)
|
1174
|
+
def disable_request_logging() -> bool:
|
1175
|
+
return get_bool_env_var("SGLANG_DISABLE_REQUEST_LOGGING")
|
1176
|
+
|
1177
|
+
|
1174
1178
|
@lru_cache(maxsize=8)
|
1175
1179
|
def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int:
|
1176
1180
|
# Note: cuda_visible_devices is not used, but we keep it as an argument for
|
@@ -1212,7 +1216,11 @@ def cuda_device_count_stateless() -> int:
|
|
1212
1216
|
return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))
|
1213
1217
|
|
1214
1218
|
|
1215
|
-
def dataclass_to_string_truncated(
|
1219
|
+
def dataclass_to_string_truncated(
|
1220
|
+
data, max_length=2048, skip_names: Optional[Set[str]] = None
|
1221
|
+
):
|
1222
|
+
if skip_names is None:
|
1223
|
+
skip_names = set()
|
1216
1224
|
if isinstance(data, str):
|
1217
1225
|
if len(data) > max_length:
|
1218
1226
|
half_length = max_length // 2
|
@@ -1231,6 +1239,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
|
|
1231
1239
|
+ ", ".join(
|
1232
1240
|
f"'{k}': {dataclass_to_string_truncated(v, max_length)}"
|
1233
1241
|
for k, v in data.items()
|
1242
|
+
if k not in skip_names
|
1234
1243
|
)
|
1235
1244
|
+ "}"
|
1236
1245
|
)
|
@@ -1241,6 +1250,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
|
|
1241
1250
|
+ ", ".join(
|
1242
1251
|
f"{f.name}={dataclass_to_string_truncated(getattr(data, f.name), max_length)}"
|
1243
1252
|
for f in fields
|
1253
|
+
if f.name not in skip_names
|
1244
1254
|
)
|
1245
1255
|
+ ")"
|
1246
1256
|
)
|
@@ -1289,7 +1299,7 @@ def debug_timing(func):
|
|
1289
1299
|
tic.record()
|
1290
1300
|
result = func(*args, **kwargs)
|
1291
1301
|
toc.record()
|
1292
|
-
|
1302
|
+
toc.synchronize() # Wait for the function to complete without synchronizing all ops on the GPU
|
1293
1303
|
elapsed = tic.elapsed_time(toc)
|
1294
1304
|
indices = kwargs.get("indices", args[1] if len(args) > 1 else None)
|
1295
1305
|
num_tokens = len(indices) if indices is not None else 0
|
@@ -1319,9 +1329,9 @@ def pyspy_dump_schedulers():
|
|
1319
1329
|
result = subprocess.run(
|
1320
1330
|
cmd, shell=True, capture_output=True, text=True, check=True
|
1321
1331
|
)
|
1322
|
-
logger.
|
1332
|
+
logger.error(f"Pyspy dump for PID {pid}:\n{result.stdout}")
|
1323
1333
|
except subprocess.CalledProcessError as e:
|
1324
|
-
logger.
|
1334
|
+
logger.error(f"Pyspy failed to dump PID {pid}. Error: {e.stderr}")
|
1325
1335
|
|
1326
1336
|
|
1327
1337
|
def kill_itself_when_parent_died():
|
@@ -1383,7 +1393,6 @@ def get_ip() -> str:
|
|
1383
1393
|
|
1384
1394
|
|
1385
1395
|
def get_open_port() -> int:
|
1386
|
-
|
1387
1396
|
port = os.getenv("SGLANG_PORT")
|
1388
1397
|
if port is not None:
|
1389
1398
|
while True:
|
@@ -1446,8 +1455,25 @@ def launch_dummy_health_check_server(host, port):
|
|
1446
1455
|
)
|
1447
1456
|
|
1448
1457
|
|
1458
|
+
def create_checksum(directory: str):
|
1459
|
+
raise NotImplementedError()
|
1460
|
+
|
1461
|
+
|
1449
1462
|
def set_cuda_arch():
|
1450
1463
|
if is_flashinfer_available():
|
1451
1464
|
capability = torch.cuda.get_device_capability()
|
1452
1465
|
arch = f"{capability[0]}.{capability[1]}"
|
1453
1466
|
os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
|
1467
|
+
|
1468
|
+
|
1469
|
+
def add_prefix(name: str, prefix: str) -> str:
|
1470
|
+
"""Add a weight path prefix to a module name.
|
1471
|
+
|
1472
|
+
Args:
|
1473
|
+
name: base module name.
|
1474
|
+
prefix: weight prefix str to added to the front of `name` concatenated with `.`.
|
1475
|
+
|
1476
|
+
Returns:
|
1477
|
+
The string `prefix.name` if prefix is non-empty, otherwise just `name`.
|
1478
|
+
"""
|
1479
|
+
return name if not prefix else f"{prefix}.{name}"
|
sglang/srt/warmup.py
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
import tqdm
|
6
|
+
|
7
|
+
from sglang.srt.managers.io_struct import GenerateReqInput
|
8
|
+
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
9
|
+
|
10
|
+
logger = logging.getLogger(__file__)
|
11
|
+
|
12
|
+
_warmup_registry = {}
|
13
|
+
|
14
|
+
|
15
|
+
def warmup(name: str) -> callable:
|
16
|
+
def decorator(fn: callable):
|
17
|
+
_warmup_registry[name] = fn
|
18
|
+
return fn
|
19
|
+
|
20
|
+
return decorator
|
21
|
+
|
22
|
+
|
23
|
+
async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
|
24
|
+
for warmup_name in warmup_names:
|
25
|
+
if warmup_name not in _warmup_registry:
|
26
|
+
logger.warning(f"Could not find custom warmup {warmup_name}")
|
27
|
+
continue
|
28
|
+
logger.info(f"Running warmup {warmup_name}")
|
29
|
+
await _warmup_registry[warmup_name](tokenizer_manager)
|
30
|
+
|
31
|
+
|
32
|
+
@warmup("voice_chat")
|
33
|
+
async def voice_chat(tokenizer_manager: TokenizerManager):
|
34
|
+
# this warms up the fused_moe triton kernels and caches them
|
35
|
+
# if we don't do this we break real time inference for voice chat
|
36
|
+
for i in tqdm.trange(1, 512):
|
37
|
+
size = i * 4
|
38
|
+
generate_req_input = GenerateReqInput(
|
39
|
+
input_ids=(np.random.randint(2**16, size=[size])).tolist(),
|
40
|
+
sampling_params={
|
41
|
+
"max_new_tokens": 30,
|
42
|
+
"temperature": 0.8,
|
43
|
+
"stop_token_ids": [1],
|
44
|
+
"min_p": 0.0,
|
45
|
+
},
|
46
|
+
)
|
47
|
+
await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
|
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -93,9 +93,11 @@ def run_eval(args):
|
|
93
93
|
tic = time.time()
|
94
94
|
states = few_shot_gsm8k.run_batch(
|
95
95
|
arguments,
|
96
|
-
temperature=0,
|
96
|
+
temperature=args.temperature if hasattr(args, "temperature") else 0,
|
97
97
|
num_threads=args.parallel,
|
98
98
|
progress_bar=True,
|
99
|
+
return_logprob=getattr(args, "return_logprob", None),
|
100
|
+
logprob_start_len=getattr(args, "logprob_start_len", None),
|
99
101
|
)
|
100
102
|
latency = time.time() - tic
|
101
103
|
|
@@ -141,5 +143,6 @@ if __name__ == "__main__":
|
|
141
143
|
parser.add_argument("--parallel", type=int, default=128)
|
142
144
|
parser.add_argument("--host", type=str, default="http://127.0.0.1")
|
143
145
|
parser.add_argument("--port", type=int, default=30000)
|
146
|
+
parser.add_argument("--temperature", type=float, default=0.0)
|
144
147
|
args = parser.parse_args()
|
145
148
|
run_eval(args)
|