sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff shows the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +208 -295
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +238 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +209 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -29
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py
CHANGED
```diff
@@ -8,10 +8,11 @@ import random
 import subprocess
 import threading
 import time
+import unittest
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from types import SimpleNamespace
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Tuple
 
 import numpy as np
 import requests
@@ -34,6 +35,7 @@ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
@@ -41,9 +43,14 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
+DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
+
 
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
-DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
+
+DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
+DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
 
 
 def is_in_ci():
@@ -158,45 +165,6 @@ def call_generate_guidance(
     return rets if n > 1 else rets[0]
 
 
-async def call_generate_lmql(
-    prompt, temperature, max_tokens, stop=None, n=1, max_len=4096, model=None, **kwargs
-):
-    assert model is not None
-    import lmql
-
-    if stop != None:
-
-        @lmql.query(model=model)
-        async def program(question, max_tokens, stop):
-            '''lmql
-            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens and STOPS_AT(ANSWER, stop)
-            return ANSWER
-            '''
-
-    else:
-
-        @lmql.query(model=model)
-        async def program(question, max_tokens):
-            '''lmql
-            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens
-            return ANSWER
-            '''
-
-    tasks = [
-        program(
-            question=prompt,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            stop=stop,
-            max_len=max_len,
-            **kwargs,
-        )
-        for _ in range(n)
-    ]
-    rets = await asyncio.gather(*tasks)
-    return rets if n > 1 else rets[0]
-
-
 def call_select_lightllm(context, choices, url=None):
     assert url is not None
 
@@ -246,23 +214,6 @@ def call_select_guidance(context, choices, model=None):
     return choices.index(out["answer"])
 
 
-async def call_select_lmql(context, choices, temperature=0, max_len=4096, model=None):
-    assert model is not None
-    import lmql
-
-    @lmql.query(model=model)
-    async def program(ctx, choices):
-        '''lmql
-        """{ctx}[ANSWER]""" where ANSWER in set(choices)
-        return ANSWER
-        '''
-
-    answer = await program(
-        ctx=context, choices=choices, temperature=temperature, max_len=max_len
-    )
-    return choices.index(answer)
-
-
 def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
     parser.add_argument("--parallel", type=int, default=64)
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
@@ -277,7 +228,6 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "lightllm",
             "gserver",
             "guidance",
-            "lmql",
             "srt-raw",
             "llama.cpp",
         ],
@@ -294,7 +244,6 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
         "vllm": 21000,
         "outlines": 21000,
         "lightllm": 22000,
-        "lmql": 23000,
         "srt-raw": 30000,
         "gserver": 9988,
     }
@@ -342,11 +291,6 @@ def _get_call_generate(args: argparse.Namespace):
         call_generate = partial(call_generate_guidance, model=model)
         call_generate("Hello,", 1.0, 8, ".")
         return call_generate
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
-        return partial(call_generate_lmql, model=model)
     else:
        raise ValueError(f"Invalid backend: {args.backend}")
 
@@ -364,12 +308,6 @@ def _get_call_select(args: argparse.Namespace):
 
         call_select("Hello,", ["world", "earth"])
         return call_select
-
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
-        return partial(call_select_lmql, model=model)
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
 
@@ -408,26 +346,49 @@ def popen_launch_server(
     other_args: list[str] = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
+    pd_seperated: bool = False,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
 
+    if pd_seperated:
+        command = "sglang.launch_pd_server"
+    else:
+        command = "sglang.launch_server"
+
     command = [
         "python3",
         "-m",
-        "sglang.launch_server",
+        command,
         "--model-path",
         model,
-        "--host",
-        host,
-        "--port",
-        port,
-        *other_args,
+        *[str(x) for x in other_args],
     ]
 
+    if pd_seperated:
+        command.extend(
+            [
+                "--lb-host",
+                host,
+                "--lb-port",
+                port,
+            ]
+        )
+    else:
+        command.extend(
+            [
+                "--host",
+                host,
+                "--port",
+                port,
+            ]
+        )
+
     if api_key:
         command += ["--api-key", api_key]
 
+    print(f"command={' '.join(command)}")
+
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
@@ -456,6 +417,8 @@ def popen_launch_server(
         except requests.RequestException:
             pass
         time.sleep(10)
+
+    kill_process_tree(process.pid)
     raise TimeoutError("Server failed to start within the timeout period.")
 
@@ -488,9 +451,11 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True
 
     for filename in files:
-
+        process = None
 
         def run_one_file(filename):
+            nonlocal process
+
             filename = os.path.join(os.getcwd(), filename)
             print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
             process = subprocess.Popen(
@@ -503,7 +468,9 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
             ret_code = run_with_timeout(
                 run_one_file, args=(filename,), timeout=timeout_per_file
             )
-            assert ret_code == 0
+            assert (
+                ret_code == 0
+            ), f"expected return code 0, but {filename} returned {ret_code}"
         except TimeoutError:
             kill_process_tree(process.pid)
             time.sleep(5)
@@ -532,11 +499,15 @@ def get_benchmark_args(
     dataset_path="",
     tokenizer="",
     num_prompts=500,
+    sharegpt_output_len=None,
     random_input_len=4096,
     random_output_len=2048,
+    sharegpt_context_len=None,
     request_rate=float("inf"),
     disable_stream=False,
     disable_ignore_eos=False,
+    seed: int = 0,
+    pd_seperated: bool = False,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -548,8 +519,8 @@ def get_benchmark_args(
         model=None,
         tokenizer=tokenizer,
         num_prompts=num_prompts,
-        sharegpt_output_len=
-        sharegpt_context_len=
+        sharegpt_output_len=sharegpt_output_len,
+        sharegpt_context_len=sharegpt_context_len,
         random_input_len=random_input_len,
         random_output_len=random_output_len,
         random_range_ratio=0.0,
@@ -559,12 +530,14 @@ def get_benchmark_args(
         disable_tqdm=False,
         disable_stream=disable_stream,
         return_logprob=False,
-        seed=
+        seed=seed,
         disable_ignore_eos=disable_ignore_eos,
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
         lora_name=None,
+        prompt_suffix="",
+        pd_seperated=pd_seperated,
     )
 
@@ -578,9 +551,11 @@ def run_bench_serving(
     tokenizer=None,
     random_input_len=4096,
     random_output_len=2048,
+    sharegpt_context_len=None,
     disable_stream=False,
     disable_ignore_eos=False,
     need_warmup=False,
+    seed: int = 0,
 ):
     # Launch the server
     base_url = DEFAULT_URL_FOR_TEST
@@ -600,9 +575,11 @@ def run_bench_serving(
         num_prompts=num_prompts,
         random_input_len=random_input_len,
         random_output_len=random_output_len,
+        sharegpt_context_len=sharegpt_context_len,
         request_rate=request_rate,
         disable_stream=disable_stream,
         disable_ignore_eos=disable_ignore_eos,
+        seed=seed,
     )
 
     try:
@@ -624,6 +601,7 @@ def run_bench_serving_multi(
     other_server_args,
     benchmark_args,
     need_warmup=False,
+    pd_seperated=False,
 ):
     # Launch the server
     process = popen_launch_server(
@@ -631,6 +609,7 @@ def run_bench_serving_multi(
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_server_args,
+        pd_seperated=pd_seperated,
     )
 
     # run benchmark for all
@@ -663,7 +642,7 @@ def run_bench_one_batch(model, other_args):
         "128",
         "--output",
         "8",
-        *other_args,
+        *[str(x) for x in other_args],
     ]
     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
@@ -814,7 +793,7 @@ def run_command_and_capture_output(command, env: Optional[dict] = None):
     stdout = open(STDOUT_FILENAME, "w")
     stderr = open(STDERR_FILENAME, "w")
     process = subprocess.Popen(
-        command, stdout=stdout, stderr=
+        command, stdout=stdout, stderr=stdout, env=env, text=True
     )
 
     # Launch a thread to stream the output
@@ -912,3 +891,78 @@ def run_mulit_request_test(
 def write_github_step_summary(content):
     with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
         f.write(content)
+
+
+def run_logprob_check(self: unittest.TestCase, arg: Tuple):
+    (
+        input_len,
+        output_len,
+        temperature,
+        logprob_start_len,
+        return_logprob,
+        top_logprobs_num,
+    ) = arg
+    input_ids = list(range(input_len))
+
+    response = requests.post(
+        self.base_url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": temperature,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+            "return_logprob": return_logprob,
+            "logprob_start_len": logprob_start_len,
+            "top_logprobs_num": top_logprobs_num,
+        },
+    )
+    response_json = response.json()
+
+    res = response_json
+    self.assertEqual(res["meta_info"]["prompt_tokens"], input_len)
+    self.assertEqual(res["meta_info"]["completion_tokens"], output_len)
+
+    # Test the number of tokens are correct
+    if return_logprob:
+        self.assertEqual(
+            len(res["meta_info"]["input_token_logprobs"]) + logprob_start_len,
+            res["meta_info"]["prompt_tokens"],
+        )
+        self.assertEqual(len(res["meta_info"]["output_token_logprobs"]), output_len)
+
+        if top_logprobs_num:
+            self.assertEqual(
+                len(res["meta_info"]["input_top_logprobs"]) + logprob_start_len,
+                res["meta_info"]["prompt_tokens"],
+            )
+            self.assertEqual(len(res["meta_info"]["output_top_logprobs"]), output_len)
+
+            for i in range(output_len):
+                self.assertEqual(
+                    len(res["meta_info"]["output_top_logprobs"][i]),
+                    top_logprobs_num,
+                )
+
+                # Test the top-1 tokens are the same as output tokens if temperature == 0
+                if temperature == 0:
+                    rank = 0
+                    while rank < len(res["meta_info"]["output_top_logprobs"][i]):
+                        try:
+                            self.assertListEqual(
+                                res["meta_info"]["output_token_logprobs"][i],
+                                res["meta_info"]["output_top_logprobs"][i][rank],
+                            )
+                            break
+                        except AssertionError:
+                            # There's a tie. Allow the second item in this case.
+                            if (
+                                res["meta_info"]["output_top_logprobs"][i][rank][0]
+                                == res["meta_info"]["output_top_logprobs"][i][rank + 1][
+                                    0
+                                ]
+                            ):
+                                rank += 1
+                            else:
+                                raise
```
sglang/utils.py
CHANGED
```diff
@@ -5,12 +5,15 @@ import importlib
 import json
 import logging
 import os
+import random
 import signal
+import socket
 import subprocess
 import sys
 import time
 import traceback
 import urllib.request
+import weakref
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from json import dumps
@@ -21,8 +24,14 @@ import requests
 from IPython.display import HTML, display
 from tqdm import tqdm
 
+from sglang.srt.openai_api.protocol import ChatCompletionMessageContentPart
+from sglang.srt.utils import kill_process_tree
+
 logger = logging.getLogger(__name__)
 
+# type of content fields, can be only prompts or with images/videos
+MsgContent = Union[str, List[ChatCompletionMessageContentPart]]
+
 
 def get_exception_traceback():
     etype, value, tb = sys.exc_info()
@@ -306,27 +315,12 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename
 
 
-import fcntl
-
-
 def is_in_ci():
     from sglang.test.test_utils import is_in_ci
 
     return is_in_ci()
 
 
-LOCKFILE = os.path.expanduser("~/.sglang_port_lock")
-PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json")
-
-if not os.path.exists(LOCKFILE):
-    with open(LOCKFILE, "w") as f:
-        pass
-
-if not os.path.exists(PORT_REGISTRY):
-    with open(PORT_REGISTRY, "w") as f:
-        json.dump([], f)
-
-
 def print_highlight(html_content: str):
     if is_in_ci():
         html_content = str(html_content).replace("\n", "<br>")
@@ -335,55 +329,44 @@ def print_highlight(html_content: str):
     print(html_content)
 
 
-
-    """Initialize the port registry file if it doesn't exist."""
-    if not os.path.exists(PORT_REGISTRY):
-        with open(PORT_REGISTRY, "w") as f:
-            json.dump([], f)
+process_socket_map = weakref.WeakKeyDictionary()
 
 
-def reserve_port(start=30000, end=40000):
+def reserve_port(host, start=30000, end=40000):
     """
-    Reserve an available port
-    Returns
+    Reserve an available port by trying to bind a socket.
+    Returns a tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
     """
-        except Exception:
-            used = []
-        for port in range(start, end):
-            if port not in used:
-                used.append(port)
-                with open(PORT_REGISTRY, "w") as f:
-                    json.dump(used, f)
-                return port
-        raise RuntimeError("No free port available")
-
-
-def release_port(port):
-    """Release the reserved port by removing it from the registry."""
-    with open(LOCKFILE, "w") as lock:
-        fcntl.flock(lock, fcntl.LOCK_EX)
+    candidates = list(range(start, end))
+    random.shuffle(candidates)
+
+    for port in candidates:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
         try:
+            # Attempt to bind to the port on localhost
+            sock.bind((host, port))
+            return port, sock
+        except socket.error:
+            sock.close()  # Failed to bind, try next port
+            continue
+    raise RuntimeError("No free port available.")
+
+
+def release_port(lock_socket):
+    """
+    Release the reserved port by closing the lock socket.
+    """
+    try:
+        lock_socket.close()
+    except Exception as e:
+        print(f"Error closing socket: {e}")
 
 
 def execute_shell_command(command: str) -> subprocess.Popen:
     """
     Execute a shell command and return its process handle.
     """
-    # Replace newline continuations and split the command string.
     command = command.replace("\\\n", " ").replace("\\", " ")
     parts = command.split()
     return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
@@ -395,21 +378,28 @@ def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
     If no port is specified, a free port is reserved.
     """
     if port is None:
-        port = reserve_port()
+        port, lock_socket = reserve_port(host)
+    else:
+        lock_socket = None
+
     full_command = f"{command} --port {port}"
     process = execute_shell_command(full_command)
+
+    if lock_socket is not None:
+        process_socket_map[process] = lock_socket
+
     return process, port
 
 
-def terminate_process(process
+def terminate_process(process):
     """
-    Terminate the process and
+    Terminate the process and automatically release the reserved port.
     """
-    from sglang.srt.utils import kill_process_tree
-
     kill_process_tree(process.pid)
-
-
+
+    lock_socket = process_socket_map.pop(process, None)
+    if lock_socket is not None:
+        release_port(lock_socket)
 
 
 def wait_for_server(base_url: str, timeout: int = None) -> None:
```
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.4.3.post1"
+__version__ = "0.4.3.post3"
```
{sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.3.post1
+Version: 0.4.3.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -235,32 +235,34 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.14; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
+Requires-Dist: transformers==4.48.3; extra == "runtime-common"
+Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: torch; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.2.post1; extra == "srt"
+Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
-Requires-Dist:
+Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
+Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
 Requires-Dist: outlines==0.1.11; extra == "srt-hip"
-Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
-Requires-Dist: outlines
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Requires-Dist: outlines
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-cpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -318,7 +320,7 @@ Provides-Extra: dev-cpu
 Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
 Requires-Dist: sglang[test]; extra == "dev-cpu"
 
-<div align="center"
+<div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
@@ -336,10 +338,11 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -366,7 +369,7 @@ The core features include:
 
 ## Getting Started
 - [Install SGLang](https://docs.sglang.ai/start/install.html)
-- [Quick Start](https://docs.sglang.ai/
+- [Quick Start](https://docs.sglang.ai/backend/send_request.html)
 - [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
 - [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
 - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
@@ -375,10 +378,13 @@ The core features include:
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
-[Development Roadmap (
+[Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-The project
+The project has been deployed to large-scale production, generating trillions of tokens every day.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+
+<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
 ## Contact Us
 
```