sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +208 -295
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +238 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +209 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -29
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
CHANGED
@@ -44,6 +44,7 @@ from sglang.srt.managers.io_struct import (
     InitWeightsUpdateGroupReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
 )
@@ -98,7 +99,7 @@ class Engine:
                 kwargs["log_level"] = "error"
             server_args = ServerArgs(**kwargs)

-        # Shutdown the subprocesses automatically when the program
+        # Shutdown the subprocesses automatically when the program exits
         atexit.register(self.shutdown)

         # Launch subprocesses
@@ -121,8 +122,10 @@ class Engine:
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
         top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
         stream: bool = False,
     ) -> Union[Dict, Iterator[Dict]]:
         """
@@ -141,9 +144,11 @@ class Engine:
             return_logprob=return_logprob,
             logprob_start_len=logprob_start_len,
             top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
             modalities=modalities_list,
             custom_logit_processor=custom_logit_processor,
+            return_hidden_states=return_hidden_states,
             stream=stream,
         )
         loop = asyncio.get_event_loop()
@@ -177,6 +182,7 @@ class Engine:
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
         top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         stream: bool = False,
@@ -193,6 +199,7 @@ class Engine:
             return_logprob=return_logprob,
             logprob_start_len=logprob_start_len,
             top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
             stream=stream,
             custom_logit_processor=custom_logit_processor,
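
For context, a minimal offline sketch of the two new `generate` arguments added above (`token_ids_logprob`, `return_hidden_states`), assuming the `sglang.Engine` entry point; the model path and token ids are placeholders, not values from this diff:

import sglang as sgl

llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model

out = llm.generate(
    prompt="The capital of France is",
    sampling_params={"max_new_tokens": 8, "temperature": 0.0},
    return_logprob=True,
    token_ids_logprob=[3681, 12366],  # ask for logprobs of these specific token ids (placeholder ids)
    return_hidden_states=True,        # request hidden states in the output
)
print(out)
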
@@ -224,15 +231,22 @@ class Engine:
         kill_process_tree(os.getpid(), include_parent=False)

     def start_profile(self):
-
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(self.tokenizer_manager.start_profile())

     def stop_profile(self):
         self.tokenizer_manager.stop_profile()

     def get_server_info(self):
+        loop = asyncio.get_event_loop()
+        internal_states = loop.run_until_complete(
+            self.tokenizer_manager.get_internal_state()
+        )
+
         return {
-            **dataclasses.asdict(self.tokenizer_manager.server_args),
+            **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
+            **internal_states,
             "version": __version__,
         }

@@ -271,16 +285,45 @@ class Engine:
             self.tokenizer_manager.update_weights_from_distributed(obj, None)
         )

-    def update_weights_from_tensor(
-
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = True,
+    ):
+        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be true
+        to avoid duplicated operations such as clearing cache."""
         obj = UpdateWeightsFromTensorReqInput(
-            serialized_named_tensors=MultiprocessingSerializer.serialize(named_tensors)
+            serialized_named_tensors=MultiprocessingSerializer.serialize(named_tensors),
+            load_format=load_format,
+            flush_cache=flush_cache,
         )
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(
             self.tokenizer_manager.update_weights_from_tensor(obj, None)
         )

+    def update_weights_from_disk(
+        self,
+        model_path: str,
+        load_format: Optional[str] = None,
+    ):
+        """Update the weights from disk inplace without re-launching the engine.
+
+        This method allows updating the model weights from disk without restarting
+        the engine. It can be used to load a different model or update weights with
+        new training.
+        """
+        obj = UpdateWeightFromDiskReqInput(
+            model_path=model_path,
+            load_format=load_format,
+        )
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_disk(obj, None)
+        )
+
     def get_weights_by_name(self, name: str, truncate_size: int = 100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
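
Continuing the earlier sketch, the new weight-update helpers added above could be driven like this (the checkpoint path and tensor are placeholders; see the docstring above for how `flush_cache` is intended to be used across consecutive updates):

import torch

# Reload a checkpoint in place without restarting the engine.
llm.update_weights_from_disk(model_path="/ckpts/step_1000")

# Push individual named tensors; load_format and flush_cache are the new optional arguments.
llm.update_weights_from_tensor(
    named_tensors=[("lm_head.weight", torch.zeros(8, 8))],  # placeholder tensor
    flush_cache=True,
)
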
@@ -313,6 +356,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
+    os.environ["CUDA_MODULE_LOADING"] = "AUTO"

     # Set prometheus env vars
     if server_args.enable_metrics:
@@ -330,18 +374,29 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.2.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )

+    def sigchld_handler(signum, frame):
+        pid, exitcode = os.waitpid(0, os.WNOHANG)
+        if exitcode != 0:
+            logger.warning(
+                "Child process unexpectedly failed with an exit code %d. pid=%d",
+                exitcode,
+                pid,
+            )
+
+    signal.signal(signal.SIGCHLD, sigchld_handler)
+
     # Register the signal handler.
     # The child processes will send SIGQUIT to this process when any error happens
     # This process then clean up the whole process tree
     def sigquit_handler(signum, frame):
         logger.error(
-            "Received sigquit from a child
+            "Received sigquit from a child process. It usually means the child failed."
         )
         kill_process_tree(os.getpid())

@@ -384,7 +439,10 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dic
         )
         for tp_rank in tp_rank_range:
             reader, writer = mp.Pipe(duplex=False)
-            gpu_id =
+            gpu_id = (
+                server_args.base_gpu_id
+                + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+            )
             proc = mp.Process(
                 target=run_scheduler_process,
                 args=(server_args, port_args, gpu_id, tp_rank, None, writer),
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -25,11 +25,14 @@ import os
 import threading
 import time
 from http import HTTPStatus
-from typing import AsyncIterator, Dict, Optional
+from typing import AsyncIterator, Callable, Dict, Optional

 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

+from contextlib import asynccontextmanager
+
+import numpy as np
 import orjson
 import requests
 import uvicorn
@@ -44,15 +47,19 @@ from sglang.srt.managers.io_struct import (
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
-    FunctionCallReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
     InitWeightsUpdateGroupReqInput,
     OpenSessionReqInput,
+    ParseFunctionCallReq,
+    ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SeparateReasoningReqInput,
+    SetInternalStateReq,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    VertexGenerateReqInput,
 )
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
@@ -69,6 +76,7 @@ from sglang.srt.openai_api.adapter import (
     v1_retrieve_file_content,
 )
 from sglang.srt.openai_api.protocol import ModelCard, ModelList
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
@@ -77,22 +85,13 @@ from sglang.srt.utils import (
     kill_process_tree,
     set_uvicorn_logging_configs,
 )
+from sglang.srt.warmup import execute_warmups
 from sglang.utils import get_exception_traceback
 from sglang.version import __version__

 logger = logging.getLogger(__name__)
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

-# Fast API
-app = FastAPI()
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-

 # Store global states
 @dataclasses.dataclass
@@ -109,6 +108,34 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


+@asynccontextmanager
+async def lifespan(fast_api_app: FastAPI):
+    server_args: ServerArgs = fast_api_app.server_args
+    if server_args.warmups is not None:
+        await execute_warmups(
+            server_args.warmups.split(","), _global_state.tokenizer_manager
+        )
+        logger.info("Warmup ended")
+
+    warmup_thread = getattr(fast_api_app, "warmup_thread", None)
+    if warmup_thread is not None:
+        warmup_thread.start()
+    yield
+
+
+# Fast API
+app = FastAPI(lifespan=lifespan)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
+
 ##### Native API endpoints #####


@@ -122,24 +149,48 @@ async def health() -> Response:
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""

-    sampling_params = {"max_new_tokens": 1, "temperature": 0.
+    sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
+    rid = f"HEALTH_CHECK_{time.time()}"

-    if _global_state.tokenizer_manager.
+    if _global_state.tokenizer_manager.is_image_gen:
+        raise NotImplementedError()
+    elif _global_state.tokenizer_manager.is_generation:
         gri = GenerateReqInput(
-
+            rid=rid,
+            input_ids=[0],
+            sampling_params=sampling_params,
+            log_metrics=False,
         )
     else:
         gri = EmbeddingReqInput(
-            input_ids=[0], sampling_params=sampling_params, log_metrics=False
+            rid=rid, input_ids=[0], sampling_params=sampling_params, log_metrics=False
         )

-
+    async def gen():
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break
-
-
-
-
+
+    tic = time.time()
+    task = asyncio.create_task(gen())
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+        await asyncio.sleep(1)
+        if _global_state.tokenizer_manager.last_receive_tstamp > tic:
+            task.cancel()
+            _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+            return Response(status_code=200)
+
+    task.cancel()
+    tic_time = time.strftime("%H:%M:%S", time.localtime(tic))
+    last_receive_time = time.strftime(
+        "%H:%M:%S", time.localtime(_global_state.tokenizer_manager.last_receive_tstamp)
+    )
+    logger.error(
+        f"Health check failed. Server couldn't get a response from detokenizer for last "
+        f"{HEALTH_CHECK_TIMEOUT} seconds. tic start time: {tic_time}. "
+        f"last_heartbeat time: {last_receive_time}"
+    )
+    _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+    return Response(status_code=503)


 @app.get("/get_model_info")
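
The reworked handler above generates a single token and then polls `last_receive_tstamp`, answering 200 once the detokenizer responds and 503 after `SGLANG_HEALTH_CHECK_TIMEOUT` seconds (default 20). A minimal probe, assuming the existing `/health_generate` route and a local server on port 30000 (both placeholders here):

import requests

resp = requests.get("http://127.0.0.1:30000/health_generate", timeout=60)
print(resp.status_code)  # 200 while healthy, 503 once the timeout elapses
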
@@ -155,13 +206,21 @@ async def get_model_info():

 @app.get("/get_server_info")
 async def get_server_info():
+    internal_states = await _global_state.tokenizer_manager.get_internal_state()
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
+        **internal_states,
         "version": __version__,
     }


+@app.api_route("/set_internal_state", methods=["POST", "PUT"])
+async def set_internal_state(obj: SetInternalStateReq, request: Request):
+    res = await _global_state.tokenizer_manager.set_internal_state(obj)
+    return res
+
+
 # fastapi implicitly converts json in the request to obj (dataclass)
 @app.api_route("/generate", methods=["POST", "PUT"])
 async def generate_request(obj: GenerateReqInput, request: Request):
@@ -178,6 +237,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
                     ) + b"\n\n"
             except ValueError as e:
                 out = {"error": {"message": str(e)}}
+                logger.error(f"Error: {e}")
                 yield b"data: " + orjson.dumps(
                     out, option=orjson.OPT_NON_STR_KEYS
                 ) + b"\n\n"
@@ -235,9 +295,14 @@ async def flush_cache():


 @app.api_route("/start_profile", methods=["GET", "POST"])
-async def start_profile_async():
+async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""
-
+    if obj is None:
+        obj = ProfileReqInput()
+
+    await _global_state.tokenizer_manager.start_profile(
+        obj.output_dir, obj.num_steps, obj.activities
+    )
     return Response(
         content="Start profiling.\n",
         status_code=200,
@@ -256,11 +321,15 @@ async def stop_profile_async():

 @app.post("/update_weights_from_disk")
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
-    """Update the weights from disk
-    success, message =
-        obj, request
+    """Update the weights from disk inplace without re-launching the server."""
+    success, message, num_paused_requests = (
+        await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
     )
-    content = {
+    content = {
+        "success": success,
+        "message": message,
+        "num_paused_requests": num_paused_requests,
+    }
     if success:
         return ORJSONResponse(
             content,
@@ -322,7 +391,7 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
 async def release_memory_occupation(
     obj: ReleaseMemoryOccupationReqInput, request: Request
 ):
-    """Release GPU occupation temporarily"""
+    """Release GPU memory occupation temporarily."""
     try:
         await _global_state.tokenizer_manager.release_memory_occupation(obj, request)
     except Exception as e:
@@ -333,7 +402,7 @@ async def release_memory_occupation(
 async def resume_memory_occupation(
     obj: ResumeMemoryOccupationReqInput, request: Request
 ):
-    """Resume GPU occupation"""
+    """Resume GPU memory occupation."""
     try:
         await _global_state.tokenizer_manager.resume_memory_occupation(obj, request)
     except Exception as e:
@@ -356,7 +425,7 @@ async def open_session(obj: OpenSessionReqInput, request: Request):

 @app.api_route("/close_session", methods=["GET", "POST"])
 async def close_session(obj: CloseSessionReqInput, request: Request):
-    """Close the session"""
+    """Close the session."""
     try:
         await _global_state.tokenizer_manager.close_session(obj, request)
         return Response(status_code=200)
@@ -366,13 +435,13 @@ async def close_session(obj: CloseSessionReqInput, request: Request):

 @app.api_route("/configure_logging", methods=["GET", "POST"])
 async def configure_logging(obj: ConfigureLoggingReq, request: Request):
-    """
+    """Configure the request logging options."""
     _global_state.tokenizer_manager.configure_logging(obj)
     return Response(status_code=200)


-@app.post("/
-async def
+@app.post("/parse_function_call")
+async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
     """
     A native API endpoint to parse function calls from a text.
     """
@@ -393,6 +462,26 @@ async def function_call_request(obj: FunctionCallReqInput, request: Request):
     return ORJSONResponse(content=response_data, status_code=200)


+@app.post("/separate_reasoning")
+async def separate_reasoning_request(obj: SeparateReasoningReqInput, request: Request):
+    """
+    A native API endpoint to separate reasoning from a text.
+    """
+    # 1) Initialize the parser based on the request body
+    parser = ReasoningParser(model_type=obj.reasoning_parser)
+
+    # 2) Call the non-stream parsing method (non-stream)
+    reasoning_text, normal_text = parser.parse_non_stream(obj.text)
+
+    # 3) Organize the response content
+    response_data = {
+        "reasoning_text": reasoning_text,
+        "text": normal_text,
+    }
+
+    return ORJSONResponse(content=response_data, status_code=200)
+
+
 ##### OpenAI-compatible API endpoints #####


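
A minimal client-side sketch of the new `/separate_reasoning` endpoint; the request fields mirror the handler above (`text`, `reasoning_parser`), while the host, port, and parser name are placeholders:

import requests

resp = requests.post(
    "http://127.0.0.1:30000/separate_reasoning",
    json={
        "text": "<think>2 + 2 = 4.</think>The answer is 4.",
        "reasoning_parser": "deepseek-r1",  # placeholder; must be a parser ReasoningParser supports
    },
)
print(resp.json())  # {"reasoning_text": "...", "text": "..."}
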
@@ -425,7 +514,7 @@ def available_models():
 @app.post("/v1/files")
 async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
     return await v1_files_create(
-        file, purpose, _global_state.tokenizer_manager.server_args.
+        file, purpose, _global_state.tokenizer_manager.server_args.file_storage_path
     )


@@ -463,6 +552,44 @@ async def retrieve_file_content(file_id: str):
     return await v1_retrieve_file_content(file_id)


+## SageMaker API
+@app.get("/ping")
+async def sagemaker_health() -> Response:
+    """Check the health of the http server."""
+    return Response(status_code=200)
+
+
+@app.post("/invocations")
+async def sagemaker_chat_completions(raw_request: Request):
+    return await v1_chat_completions(_global_state.tokenizer_manager, raw_request)
+
+
+## Vertex AI API
+@app.post(os.environ.get("AIP_PREDICT_ROUTE", "/vertex_generate"))
+async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
+    if not vertex_req.instances:
+        return []
+    inputs = {}
+    for input_key in ("text", "input_ids", "input_embeds"):
+        if vertex_req.instances[0].get(input_key):
+            inputs[input_key] = [
+                instance.get(input_key) for instance in vertex_req.instances
+            ]
+            break
+    image_data = [
+        instance.get("image_data")
+        for instance in vertex_req.instances
+        if instance.get("image_data") is not None
+    ] or None
+    req = GenerateReqInput(
+        **inputs,
+        image_data=image_data,
+        **(vertex_req.parameters or {}),
+    )
+    ret = await generate_request(req, raw_request)
+    return ORJSONResponse({"predictions": ret})
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
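
The new Vertex AI route above pulls `text`, `input_ids`, or `input_embeds` plus optional `image_data` out of `instances` and forwards anything under `parameters` as extra `GenerateReqInput` fields. A hedged example payload; the URL, prompts, and sampling options are placeholders:

import requests

payload = {
    "instances": [
        {"text": "The capital of France is"},
        {"text": "The capital of Japan is"},
    ],
    "parameters": {"sampling_params": {"max_new_tokens": 8}},
}
resp = requests.post("http://127.0.0.1:30000/vertex_generate", json=payload)
print(resp.json()["predictions"])
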
@@ -472,6 +599,7 @@ def _create_error_response(e):
 def launch_server(
     server_args: ServerArgs,
     pipe_finish_writer: Optional[multiprocessing.connection.Connection] = None,
+    launch_callback: Optional[Callable[[], None]] = None,
 ):
     """
     Launch SRT (SGLang Runtime) Server.
@@ -505,21 +633,23 @@ def launch_server(
         add_prometheus_middleware(app)
         enable_func_timer()

-    # Send a warmup request
-
+    # Send a warmup request - we will create the thread launch it
+    # in the lifespan after all other warmups have fired.
+    warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
             server_args,
             pipe_finish_writer,
             _global_state.tokenizer_manager.image_token_id,
+            launch_callback,
         ),
     )
-
+    app.warmup_thread = warmup_thread

     try:
         # Update logging configs
         set_uvicorn_logging_configs()
-
+        app.server_args = server_args
         # Listen for HTTP requests
         uvicorn.run(
             app,
@@ -530,10 +660,15 @@ def launch_server(
             loop="uvloop",
         )
     finally:
-
+        warmup_thread.join()


-def _wait_and_warmup(
+def _wait_and_warmup(
+    server_args: ServerArgs,
+    pipe_finish_writer: Optional[multiprocessing.connection.Connection],
+    image_token_text: str,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
     headers = {}
     url = server_args.url()
     if server_args.api_key:
@@ -575,8 +710,16 @@ def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text):
     else:
         json_data["text"] = "The capital city of France is"

+    # Debug dumping
+    if server_args.debug_tensor_dump_input_file:
+        json_data.pop("text", None)
+        json_data["input_ids"] = np.load(
+            server_args.debug_tensor_dump_input_file
+        ).tolist()
+        json_data["sampling_params"]["max_new_tokens"] = 0
+
     try:
-        for
+        for i in range(server_args.dp_size):
             res = requests.post(
                 url + request_name,
                 json=json_data,
@@ -601,3 +744,9 @@ def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text):

     if server_args.delete_ckpt_after_loading:
         delete_directory(server_args.model_path)
+
+    if server_args.debug_tensor_dump_input_file:
+        kill_process_tree(os.getpid())
+
+    if launch_callback is not None:
+        launch_callback()