sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +41 -27
- sglang/bench_one_batch.py +60 -4
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +83 -71
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +46 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +452 -0
- sglang/srt/entrypoints/http_server.py +603 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +8 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +71 -0
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +65 -14
- sglang/srt/layers/logits_processor.py +49 -64
- sglang/srt/layers/moe/ep_moe/layer.py +24 -16
- sglang/srt/layers/moe/fused_moe_native.py +84 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
- sglang/srt/layers/parameter.py +18 -8
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +10 -4
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1184 -31
- sglang/srt/layers/sampler.py +64 -6
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +24 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +57 -3
- sglang/srt/managers/schedule_batch.py +78 -45
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +326 -201
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +210 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +26 -30
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +44 -19
- sglang/srt/model_loader/loader.py +83 -6
- sglang/srt/model_loader/weight_utils.py +145 -6
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +17 -5
- sglang/srt/models/dbrx.py +13 -5
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +11 -11
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +15 -25
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +4 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +9 -9
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +20 -7
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +7 -4
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +143 -18
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +77 -15
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +164 -129
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +2 -1
- sglang/test/test_utils.py +83 -22
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
```diff
@@ -1,5 +1,6 @@
-#
+# SGLang public APIs
 
+# Frontend Language APIs
 from sglang.api import (
     Engine,
     Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+# Other configs
+from sglang.global_config import global_config
+from sglang.version import __version__
 
-# SGLang DSL APIs
 __all__ = [
-    "Runtime",
     "Engine",
+    "Runtime",
     "assistant",
     "assistant_begin",
     "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "RuntimeEndpoint",
     "greedy_token_selection",
     "token_length_normalized",
     "unconditional_likelihood_normalized",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "VertexAI",
+    "global_config",
+    "__version__",
 ]
-
-# Global Configurations
-from sglang.global_config import global_config
-
-__all__ += ["global_config"]
-
-from sglang.version import __version__
-
-__all__ += ["__version__"]
-
-# SGLang Backends
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.utils import LazyImport
-
-Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
-LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
-OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
-VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-
-__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
```
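The backend classes are now bound through `LazyImport`, so `import sglang` no longer eagerly pulls in the optional `anthropic`, `litellm`, `openai`, or `vertexai` dependencies. The diff does not show `sglang.utils.LazyImport` itself; below is a minimal sketch of how such a shim can work, assuming it only needs to resolve the target on first call or attribute access (the real implementation may differ):

```python
import importlib


class LazyImport:
    """Defer `from module import name` until the object is first used."""

    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._target = None

    def _load(self):
        # The real import happens here, on first use.
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.class_name)
        return self._target

    def __call__(self, *args, **kwargs):
        return self._load()(*args, **kwargs)

    def __getattr__(self, name):
        return getattr(self._load(), name)


# Mirrors the new __init__.py: constructing the shim imports nothing yet.
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
```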
sglang/api.py
CHANGED
```diff
@@ -1,6 +1,5 @@
 """Public APIs of the language."""
 
-import os
 import re
 from typing import Callable, List, Optional, Union
 
@@ -33,19 +32,15 @@ def function(
 
 
 def Runtime(*args, **kwargs):
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
     # Avoid importing unnecessary dependency
-    from sglang.
+    from sglang.lang.backend.runtime_endpoint import Runtime
 
     return Runtime(*args, **kwargs)
 
 
 def Engine(*args, **kwargs):
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
     # Avoid importing unnecessary dependency
-    from sglang.srt.
+    from sglang.srt.entrypoints.engine import Engine
 
     return Engine(*args, **kwargs)
 
```
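Both wrappers keep the real import inside the function body, so the heavy `sglang.srt` stack loads only when an `Engine` is actually constructed, not at `import sglang` time. A hypothetical usage sketch; the `model_path` argument and the `generate` call follow the usual Engine interface, but the exact signature is not part of this diff:

```python
import sglang  # cheap: backends and srt entrypoints are not imported yet

# The deferred import of sglang.srt.entrypoints.engine happens here.
llm = sglang.Engine(model_path="meta-llama/Meta-Llama-3-8B-Instruct")
print(llm.generate("The capital of France is", {"max_new_tokens": 8}))
```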
sglang/bench_offline_throughput.py
CHANGED
```diff
@@ -27,7 +27,8 @@ from sglang.bench_serving import (
     sample_random_requests,
     set_ulimit,
 )
-from sglang.
+from sglang.lang.backend.runtime_endpoint import Runtime
+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.server_args import ServerArgs
 
 
@@ -39,20 +40,22 @@ class BenchArgs:
     dataset_path: str = ""
     num_prompts: int = 1000
     sharegpt_output_len: Optional[int] = None
+    sharegpt_context_len: Optional[int] = None
     random_input_len: int = 1024
     random_output_len: int = 1024
     random_range_ratio: float = 0.0
-
-
-
-
-
+    gsp_num_groups: int = 64
+    gsp_prompts_per_group: int = 16
+    gsp_system_prompt_len: int = 2048
+    gsp_question_len: int = 128
+    gsp_output_len: int = 256
+    seed: int = 1
     disable_ignore_eos: bool = False
     extra_request_body: Optional[str] = None
-
+    apply_chat_template: bool = False
+    profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
-    profile: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -82,6 +85,12 @@ class BenchArgs:
             default=BenchArgs.sharegpt_output_len,
             help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
         )
+        parser.add_argument(
+            "--sharegpt-context-len",
+            type=int,
+            default=BenchArgs.sharegpt_context_len,
+            help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+        )
         parser.add_argument(
             "--random-input-len",
             type=int,
@@ -102,51 +111,62 @@ class BenchArgs:
             "used only for random dataset.",
         )
         parser.add_argument(
-            "--
+            "--gsp-num-groups",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_num_groups,
             help="Number of groups with shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-prompts-per-group",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_prompts_per_group,
             help="Number of prompts per group of shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-system-prompt-len",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_system_prompt_len,
             help="System prompt length, used" "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-question-len",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_question_len,
             help="Question length, used" "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-output-len",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_output_len,
             help="Target length in tokens for outputs in generated-shared-prefix dataset",
         )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
         parser.add_argument(
             "--disable-ignore-eos",
-
-            default=BenchArgs.disable_ignore_eos,
+            action="store_true",
             help="Disable ignore EOS token",
         )
         parser.add_argument(
             "--extra-request-body",
             metavar='{"key1": "value1", "key2": "value2"}',
             type=str,
+            default=BenchArgs.extra_request_body,
             help="Append given JSON object to the request payload. You can use this to specify"
             "additional generate params like sampling params.",
         )
-        parser.add_argument(
+        parser.add_argument(
+            "--apply-chat-template",
+            action="store_true",
+            help="Apply chat template",
+        )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
         parser.add_argument(
             "--skip-warmup",
             action="store_true",
@@ -157,12 +177,6 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
-        parser.add_argument(
-            "--profile",
-            action="store_true",
-            help="Use Torch Profiler. The endpoint must be launched with "
-            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
```
sglang/bench_one_batch.py
CHANGED
```diff
@@ -9,7 +9,8 @@ It accepts server arguments (the same as launch_server.py) and benchmark arguments
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
-
+## run with profiling:
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
 # Usage (correctness test):
 python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
@@ -56,15 +57,21 @@ import torch
 import torch.distributed as dist
 
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
-from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    configure_logger,
+    get_bool_env_var,
+    kill_process_tree,
+    set_gpu_proc_affinity,
+    suppress_other_loggers,
+)
 
 
 @dataclasses.dataclass
@@ -77,6 +84,8 @@ class BenchArgs:
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
+    profile: bool = False
+    profile_filename_prefix: str = "profile"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -95,6 +104,16 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        parser.add_argument(
+            "--profile", action="store_true", help="Use Torch Profiler."
+        )
+        parser.add_argument(
+            "--profile-filename-prefix",
+            type=str,
+            default=BenchArgs.profile_filename_prefix,
+            help="Prefix of the profiling file names. The full profiling result file(s) be "
+            '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +235,7 @@ def extend(reqs, model_runner):
         model_config=model_runner.model_config,
         enable_overlap=False,
         spec_algorithm=SpeculativeAlgorithm.NONE,
+        enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -286,7 +306,16 @@ def synchronize(device):
 
 
 def latency_test_run_once(
-    run_name,
+    run_name,
+    model_runner,
+    rank_print,
+    reqs,
+    batch_size,
+    input_len,
+    output_len,
+    device,
+    profile,
+    profile_filename_prefix,
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -308,6 +337,17 @@ def latency_test_run_once(
 
     tot_latency = 0
 
+    profiler = None
+    if profile:
+        profiler = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            with_stack=True,
+        )
+        profiler.start()
+
     # Prefill
     synchronize(device)
     tic = time.time()
@@ -338,6 +378,14 @@ def latency_test_run_once(
         f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
 
+    if profile:
+        profiler.stop()
+        profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
+        parent_dir = os.path.dirname(os.path.abspath(profile_filename))
+        os.makedirs(parent_dir, exist_ok=True)
+        profiler.export_chrome_trace(profile_filename)
+        rank_print(f"torch profiler chrome trace saved to {profile_filename}")
+
     # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
@@ -363,6 +411,10 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    # Set CPU affinity
+    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
+        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
+
     # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
@@ -386,6 +438,8 @@ def latency_test(
        bench_args.input_len[0],
        8,  # shorter decoding to speed up the warmup
        server_args.device,
+       profile=False,
+       profile_filename_prefix="",  # not used
    )
 
    rank_print("Benchmark ...")
@@ -405,6 +459,8 @@ def latency_test(
            il,
            ol,
            server_args.device,
+           bench_args.profile if tp_rank == 0 else None,
+           bench_args.profile_filename_prefix,
        )
        if ret is not None:
            result_list.append(ret)
```
sglang/bench_one_batch_server.py
CHANGED
```diff
@@ -22,7 +22,7 @@ from typing import Tuple
 import numpy as np
 import requests
 
-from sglang.srt.
+from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
 
```