sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +1 -0
- sglang/bench_serving.py +9 -1
- sglang/check_env.py +140 -48
- sglang/lang/backend/runtime_endpoint.py +1 -0
- sglang/lang/chat_template.py +32 -0
- sglang/llama3_eval.py +316 -0
- sglang/srt/aio_rwlock.py +100 -0
- sglang/srt/configs/model_config.py +8 -1
- sglang/srt/constrained/xgrammar_backend.py +4 -1
- sglang/srt/layers/attention/flashinfer_backend.py +51 -5
- sglang/srt/layers/attention/triton_backend.py +16 -25
- sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
- sglang/srt/layers/linear.py +20 -2
- sglang/srt/layers/logits_processor.py +133 -95
- sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
- sglang/srt/layers/moe/fused_moe_native.py +46 -0
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
- sglang/srt/layers/moe/topk.py +191 -0
- sglang/srt/layers/quantization/__init__.py +5 -50
- sglang/srt/layers/quantization/fp8.py +221 -36
- sglang/srt/layers/quantization/fp8_kernel.py +278 -0
- sglang/srt/layers/quantization/fp8_utils.py +90 -1
- sglang/srt/layers/radix_attention.py +8 -1
- sglang/srt/layers/sampler.py +27 -5
- sglang/srt/layers/torchao_utils.py +31 -0
- sglang/srt/managers/detokenizer_manager.py +37 -17
- sglang/srt/managers/io_struct.py +39 -10
- sglang/srt/managers/schedule_batch.py +54 -34
- sglang/srt/managers/schedule_policy.py +64 -5
- sglang/srt/managers/scheduler.py +171 -136
- sglang/srt/managers/tokenizer_manager.py +184 -133
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +2 -2
- sglang/srt/mem_cache/memory_pool.py +15 -8
- sglang/srt/mem_cache/radix_cache.py +12 -2
- sglang/srt/model_executor/cuda_graph_runner.py +25 -11
- sglang/srt/model_executor/model_runner.py +28 -14
- sglang/srt/model_parallel.py +66 -5
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +67 -18
- sglang/srt/models/gemma2.py +34 -0
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/granite.py +517 -0
- sglang/srt/models/grok.py +73 -9
- sglang/srt/models/llama.py +22 -0
- sglang/srt/models/llama_classification.py +11 -23
- sglang/srt/models/llama_reward.py +0 -2
- sglang/srt/models/llava.py +37 -14
- sglang/srt/models/mixtral.py +2 -2
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen2.py +20 -0
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/openai_api/adapter.py +8 -0
- sglang/srt/openai_api/protocol.py +9 -4
- sglang/srt/server.py +2 -1
- sglang/srt/server_args.py +19 -9
- sglang/srt/utils.py +40 -54
- sglang/test/test_block_fp8.py +341 -0
- sglang/test/test_utils.py +3 -2
- sglang/utils.py +10 -3
- sglang/version.py +1 -1
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
- sglang/srt/layers/fused_moe_patch.py +0 -133
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
sglang/bench_offline_throughput.py
CHANGED
@@ -201,18 +201,17 @@ def throughput_test_once(
         for r in reqs
     ]

-    st = time.perf_counter()
     if profile:
         backend.start_profile()

+    st = time.perf_counter()
     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st

     if profile:
         backend.stop_profile()
         monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))

-    latency = time.perf_counter() - st
-
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)

@@ -285,7 +284,7 @@ def throughput_test(
     else:
         raise ValueError('Please set backend to either "engine" or "runtime"')

-    tokenizer_id = server_args.model_path
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
     tokenizer = get_tokenizer(tokenizer_id)

     # Set global environmnets
@@ -304,8 +303,8 @@ def throughput_test(
     warmup_requests = sample_random_requests(
         input_len=256,
         output_len=16,
-        num_prompts=16,
-        range_ratio=0,
+        num_prompts=min(bench_args.num_prompts, 16),
+        range_ratio=1.0,
         tokenizer=tokenizer,
         dataset_path=bench_args.dataset_path,
     )
@@ -321,6 +320,7 @@ def throughput_test(
         extra_request_body=extra_request_body,
         profile=False,
     )
+    time.sleep(0.5)

     logging.info("\nBenchmark...")
     result = throughput_test_once(
sglang/bench_one_batch.py
CHANGED
sglang/bench_serving.py
CHANGED
@@ -321,6 +321,8 @@ async def async_request_sglang_generate(
             },
             "stream": not args.disable_stream,
             "lora_path": request_func_input.lora_name,
+            "return_logprob": args.return_logprob,
+            "logprob_start_len": -1,
             **request_func_input.extra_request_body,
         }
         headers = {}
@@ -911,7 +913,7 @@ async def benchmark(
         prompt=test_prompt,
         api_url=api_url,
         prompt_len=test_prompt_len,
-        output_len=test_output_len,
+        output_len=min(test_output_len, 32),
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
@@ -922,6 +924,7 @@ async def benchmark(
             f"are correctly specified. Error: {test_output.error}"
         )
     else:
+        requests.post(base_url + "/flush_cache")
        print("Initial test run completed. Starting main benchmark run...")

     time.sleep(1.5)
@@ -1413,6 +1416,11 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable ignoring EOS.",
     )
+    parser.add_argument(
+        "--return-logprob",
+        action="store_true",
+        help="Return logprob.",
+    )
     parser.add_argument(
         "--extra-request-body",
         metavar='{"key1": "value1", "key2": "value2"}',
sglang/check_env.py
CHANGED
@@ -9,6 +9,13 @@ from collections import OrderedDict, defaultdict

 import torch

+from sglang.srt.utils import is_hip
+
+
+def is_cuda_v2():
+    return torch.version.cuda is not None
+
+
 # List of packages to check versions
 PACKAGE_LIST = [
     "sglang",
@@ -63,13 +70,22 @@ def get_cuda_info():
     """
     Get CUDA-related information if available.
     """
-
+    if is_cuda_v2():
+        cuda_info = {"CUDA available": torch.cuda.is_available()}
+
+        if cuda_info["CUDA available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())
+
+        return cuda_info
+    elif is_hip():
+        cuda_info = {"ROCM available": torch.cuda.is_available()}

-
-
-
+        if cuda_info["ROCM available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())

-
+        return cuda_info


 def _get_gpu_info():
@@ -103,34 +119,72 @@ def _get_cuda_version_info():
     """
     Get CUDA version information.
     """
-
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME

-
+        cuda_info = {"CUDA_HOME": CUDA_HOME}

-
-
-
+        if CUDA_HOME and os.path.isdir(CUDA_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())

-
+        return cuda_info
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
+
+        cuda_info = {"ROCM_HOME": ROCM_HOME}
+
+        if ROCM_HOME and os.path.isdir(ROCM_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())
+
+        return cuda_info
+    else:
+        cuda_info = {"CUDA_HOME": ""}
+        return cuda_info


 def _get_nvcc_info():
     """
     Get NVCC version information.
     """
-
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME

-
-
-
-
-
-
-
-
-
-
-
+        try:
+            nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
+            nvcc_output = (
+                subprocess.check_output(f'"{nvcc}" -V', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "NVCC": nvcc_output[
+                    nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
+                        "Build"
+                    )
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"NVCC": "Not Available"}
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME
+
+        try:
+            hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
+            hipcc_output = (
+                subprocess.check_output(f'"{hipcc}" --version', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "HIPCC": hipcc_output[
+                    hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"HIPCC": "Not Available"}
+    else:
         return {"NVCC": "Not Available"}


@@ -139,20 +193,40 @@ def _get_cuda_driver_version():
     Get CUDA driver version.
     """
     versions = set()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if is_cuda_v2():
+        try:
+            output = subprocess.check_output(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=driver_version",
+                    "--format=csv,noheader,nounits",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            if len(versions) == 1:
+                return {"CUDA Driver Version": versions.pop()}
+            else:
+                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
+        except subprocess.SubprocessError:
+            return {"CUDA Driver Version": "Not Available"}
+    elif is_hip():
+        try:
+            output = subprocess.check_output(
+                [
+                    "rocm-smi",
+                    "--showdriverversion",
+                    "--csv",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            versions.discard("name, value")
+            ver = versions.pop()
+            ver = ver.replace('"Driver version", ', "").replace('"', "")
+
+            return {"ROCM Driver Version": ver}
+        except subprocess.SubprocessError:
+            return {"ROCM Driver Version": "Not Available"}
+    else:
         return {"CUDA Driver Version": "Not Available"}


@@ -160,16 +234,31 @@ def get_gpu_topology():
     """
     Get GPU topology information.
     """
-
-
-
-
-
-
-
-
-
-
+    if is_cuda_v2():
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "topo", "-m"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    elif is_hip():
+        try:
+            result = subprocess.run(
+                ["rocm-smi", "--showtopotype"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    else:
         return None


@@ -196,7 +285,10 @@ def check_env():

     gpu_topo = get_gpu_topology()
     if gpu_topo:
-
+        if is_cuda_v2():
+            env_info["NVIDIA Topology"] = gpu_topo
+        elif is_hip():
+            env_info["AMD Topology"] = gpu_topo

     hypervisor_vendor = get_hypervisor_vendor()
     if hypervisor_vendor:
sglang/lang/chat_template.py
CHANGED
@@ -320,6 +320,28 @@ register_chat_template(
     )
 )

+register_chat_template(
+    ChatTemplate(
+        name="granite-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_of_role|>system<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "user": (
+                "<|start_of_role|>user<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "assistant": (
+                "<|start_of_role|>assistant<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+        },
+        stop_str=("<|end_of_text|>",),
+    )
+)
+

 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
@@ -402,6 +424,16 @@ def match_c4ai_command_r(model_path: str):
         return get_chat_template("c4ai-command-r")


+@register_chat_template_matching_function
+def match_granite_instruct(model_path: str):
+    model_path = model_path.lower()
+    # When future versions of Granite are released, this code may
+    # need to be updated. For now, assume that the Granite 3.0
+    # template works across the board.
+    if "granite" in model_path and "instruct" in model_path:
+        return get_chat_template("granite-3-instruct")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default