sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -2
- sglang/bench_serving.py +224 -127
- sglang/compile_deep_gemm.py +3 -0
- sglang/launch_server.py +0 -14
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/falcon_h1.py +12 -58
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +68 -31
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +11 -43
- sglang/srt/disaggregation/decode.py +7 -18
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/nixl/conn.py +55 -23
- sglang/srt/disaggregation/prefill.py +17 -32
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/entrypoints/grpc_request_manager.py +10 -23
- sglang/srt/entrypoints/grpc_server.py +220 -80
- sglang/srt/entrypoints/http_server.py +49 -1
- sglang/srt/entrypoints/openai/protocol.py +159 -31
- sglang/srt/entrypoints/openai/serving_chat.py +13 -71
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +4 -0
- sglang/srt/function_call/function_call_parser.py +8 -6
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
- sglang/srt/layers/attention/attention_registry.py +31 -22
- sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
- sglang/srt/layers/attention/flashattention_backend.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +223 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/triton_backend.py +1 -1
- sglang/srt/layers/logits_processor.py +136 -6
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
- sglang/srt/layers/moe/ep_moe/layer.py +8 -286
- sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/utils.py +7 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/modelopt_quant.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/w4afp8.py +2 -16
- sglang/srt/lora/lora_manager.py +0 -8
- sglang/srt/managers/overlap_utils.py +18 -16
- sglang/srt/managers/schedule_batch.py +119 -90
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +213 -126
- sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
- sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
- sglang/srt/managers/tokenizer_manager.py +270 -53
- sglang/srt/managers/tp_worker.py +39 -28
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +162 -68
- sglang/srt/mem_cache/radix_cache.py +8 -3
- sglang/srt/mem_cache/swa_radix_cache.py +70 -14
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/forward_batch_info.py +4 -18
- sglang/srt/model_executor/model_runner.py +55 -51
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +187 -6
- sglang/srt/model_loader/weight_utils.py +3 -0
- sglang/srt/models/falcon_h1.py +11 -9
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +11 -1
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/utils.py +5 -1
- sglang/srt/sampling/sampling_batch_info.py +11 -9
- sglang/srt/server_args.py +100 -33
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_utils.py +0 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils/common.py +18 -0
- sglang/srt/utils/hf_transformers_utils.py +2 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +40 -0
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +18 -2
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +63 -0
- sglang/test/test_utils.py +32 -11
- sglang/version.py +1 -1
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -120,7 +120,7 @@ def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dty
|
|
120
120
|
)
|
121
121
|
topk_weights, topk_ids, _ = topk_output
|
122
122
|
expert_map = torch.arange(E, dtype=torch.int32, device=device)
|
123
|
-
expert_map[local_e:] =
|
123
|
+
expert_map[local_e:] = -1
|
124
124
|
|
125
125
|
output = cutlass_moe(
|
126
126
|
a,
|
@@ -138,9 +138,7 @@ def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dty
|
|
138
138
|
c_strides2,
|
139
139
|
s_strides13,
|
140
140
|
s_strides2,
|
141
|
-
|
142
|
-
local_e - 1,
|
143
|
-
E,
|
141
|
+
local_e,
|
144
142
|
a1_scale,
|
145
143
|
a2_scale,
|
146
144
|
expert_map,
|
@@ -178,7 +176,7 @@ def cutlass_moe(
|
|
178
176
|
w1_scale: torch.Tensor,
|
179
177
|
w2_scale: torch.Tensor,
|
180
178
|
topk_weights: torch.Tensor,
|
181
|
-
|
179
|
+
topk_ids: torch.Tensor,
|
182
180
|
a_strides1: torch.Tensor,
|
183
181
|
b_strides1: torch.Tensor,
|
184
182
|
c_strides1: torch.Tensor,
|
@@ -187,40 +185,32 @@ def cutlass_moe(
|
|
187
185
|
c_strides2: torch.Tensor,
|
188
186
|
s_strides13: torch.Tensor,
|
189
187
|
s_strides2: torch.Tensor,
|
190
|
-
|
191
|
-
end_expert_id: int,
|
192
|
-
E: int,
|
188
|
+
num_local_experts: int,
|
193
189
|
a1_scale: Optional[torch.Tensor] = None,
|
194
190
|
a2_scale: Optional[torch.Tensor] = None,
|
195
191
|
expert_map: Optional[torch.Tensor] = None,
|
196
192
|
apply_router_weight_on_input: bool = False,
|
197
193
|
):
|
198
|
-
|
199
|
-
local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
|
194
|
+
topk_ids = expert_map[topk_ids]
|
200
195
|
device = a.device
|
201
196
|
|
202
|
-
local_num_experts = end_expert_id - start_expert_id + 1
|
203
197
|
expert_offsets = torch.empty(
|
204
|
-
(
|
198
|
+
(num_local_experts + 1), dtype=torch.int32, device=device
|
205
199
|
)
|
206
200
|
problem_sizes1 = torch.empty(
|
207
|
-
(
|
201
|
+
(num_local_experts, 3), dtype=torch.int32, device=device
|
208
202
|
)
|
209
203
|
problem_sizes2 = torch.empty(
|
210
|
-
(
|
204
|
+
(num_local_experts, 3), dtype=torch.int32, device=device
|
211
205
|
)
|
212
206
|
return cutlass_w4a8_moe(
|
213
|
-
start_expert_id,
|
214
|
-
end_expert_id,
|
215
|
-
E,
|
216
207
|
a,
|
217
208
|
w1_q,
|
218
209
|
w2_q,
|
219
210
|
w1_scale,
|
220
211
|
w2_scale,
|
221
212
|
topk_weights,
|
222
|
-
|
223
|
-
local_topk_ids,
|
213
|
+
topk_ids,
|
224
214
|
a_strides1,
|
225
215
|
b_strides1,
|
226
216
|
c_strides1,
|
@@ -39,12 +39,15 @@ class BenchArgs:
|
|
39
39
|
profile_steps: int = 3
|
40
40
|
profile_by_stage: bool = False
|
41
41
|
test_mode: str = "single"
|
42
|
+
n_trials: int = 50
|
43
|
+
n_start: int = 1
|
42
44
|
|
43
45
|
@staticmethod
|
44
46
|
def add_cli_args(parser: argparse.ArgumentParser):
|
45
47
|
parser.add_argument("--host", type=str, default=BenchArgs.host)
|
46
48
|
parser.add_argument("--port", type=int, default=BenchArgs.port)
|
47
|
-
parser.add_argument("--n-trials", type=int, default=
|
49
|
+
parser.add_argument("--n-trials", type=int, default=BenchArgs.n_trials)
|
50
|
+
parser.add_argument("--n-start", type=int, default=BenchArgs.n_start)
|
48
51
|
parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
|
49
52
|
parser.add_argument(
|
50
53
|
"--sampling-seed", type=int, default=BenchArgs.sampling_seed
|
@@ -238,6 +241,8 @@ def test_deterministic(args):
|
|
238
241
|
texts.append(text)
|
239
242
|
|
240
243
|
print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
|
244
|
+
return [len(set(texts))]
|
245
|
+
|
241
246
|
elif args.test_mode == "mixed":
|
242
247
|
# In mixed mode, we send a mixture of two short prompts and one long prompt in the same batch with batch size ranging from 1 to n_trials.
|
243
248
|
output_prompt_1 = []
|
@@ -264,13 +269,19 @@ def test_deterministic(args):
|
|
264
269
|
f"Long prompt: total samples: {len(output_long_prompt)}, Unique samples: {len(set(output_long_prompt))}"
|
265
270
|
)
|
266
271
|
|
272
|
+
return [
|
273
|
+
len(set(output_prompt_1)),
|
274
|
+
len(set(output_prompt_2)),
|
275
|
+
len(set(output_long_prompt)),
|
276
|
+
]
|
277
|
+
|
267
278
|
elif args.test_mode == "prefix":
|
268
279
|
# In prefix mode, we create prompts from the same long prompt, with different lengths of common prefix.
|
269
280
|
len_prefix = [1, 511, 2048, 4097]
|
270
281
|
num_prompts = len(len_prefix)
|
271
282
|
outputs = {i: [] for i in range(4)}
|
272
283
|
prompts = [LONG_PROMPT[: len_prefix[i]] for i in range(4)]
|
273
|
-
for i in range(
|
284
|
+
for i in range(args.n_start, args.n_start + args.n_trials):
|
274
285
|
batch_size = i
|
275
286
|
ret_dict = send_prefix(args, batch_size, prompts)
|
276
287
|
msg = f"Testing Trial {i} with batch size {batch_size},"
|
@@ -285,6 +296,11 @@ def test_deterministic(args):
|
|
285
296
|
f"Prompt {i} with prefix length {len_prefix[i]}: total samples: {len(outputs[i])}, Unique samples: {len(set(outputs[i]))}"
|
286
297
|
)
|
287
298
|
|
299
|
+
results = []
|
300
|
+
for i in range(num_prompts):
|
301
|
+
results.append(len(set(outputs[i])))
|
302
|
+
return results
|
303
|
+
|
288
304
|
else:
|
289
305
|
raise ValueError(f"Invalid test mode: {args.test_mode}")
|
290
306
|
|
@@ -0,0 +1,81 @@
|
|
1
|
+
import time
|
2
|
+
import unittest
|
3
|
+
|
4
|
+
import requests
|
5
|
+
|
6
|
+
from sglang.srt.utils import kill_process_tree
|
7
|
+
from sglang.test.test_deterministic import BenchArgs, test_deterministic
|
8
|
+
from sglang.test.test_utils import (
|
9
|
+
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
10
|
+
DEFAULT_URL_FOR_TEST,
|
11
|
+
CustomTestCase,
|
12
|
+
popen_launch_server,
|
13
|
+
)
|
14
|
+
|
15
|
+
DEFAULT_MODEL = "Qwen/Qwen3-8B"
|
16
|
+
COMMON_SERVER_ARGS = [
|
17
|
+
"--trust-remote-code",
|
18
|
+
"--cuda-graph-max-bs",
|
19
|
+
"32",
|
20
|
+
"--enable-deterministic-inference",
|
21
|
+
]
|
22
|
+
|
23
|
+
|
24
|
+
class TestDeterministicBase(CustomTestCase):
|
25
|
+
@classmethod
|
26
|
+
def get_server_args(cls):
|
27
|
+
return COMMON_SERVER_ARGS
|
28
|
+
|
29
|
+
@classmethod
|
30
|
+
def setUpClass(cls):
|
31
|
+
cls.model = DEFAULT_MODEL
|
32
|
+
cls.base_url = DEFAULT_URL_FOR_TEST
|
33
|
+
if "--attention-backend" not in cls.get_server_args():
|
34
|
+
raise unittest.SkipTest("Skip the base test class")
|
35
|
+
|
36
|
+
cls.process = popen_launch_server(
|
37
|
+
cls.model,
|
38
|
+
cls.base_url,
|
39
|
+
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
40
|
+
other_args=cls.get_server_args(),
|
41
|
+
)
|
42
|
+
|
43
|
+
@classmethod
|
44
|
+
def tearDownClass(cls):
|
45
|
+
kill_process_tree(cls.process.pid)
|
46
|
+
|
47
|
+
def _extract_host_and_port(self, url):
|
48
|
+
return url.split("://")[-1].split(":")[0], int(url.split(":")[-1])
|
49
|
+
|
50
|
+
def test_single(self):
|
51
|
+
args = BenchArgs()
|
52
|
+
url = DEFAULT_URL_FOR_TEST
|
53
|
+
args.host, args.port = self._extract_host_and_port(url)
|
54
|
+
args.test_mode = "single"
|
55
|
+
args.n_start = 10
|
56
|
+
args.n_trials = 20
|
57
|
+
results = test_deterministic(args)
|
58
|
+
for result in results:
|
59
|
+
assert result == 1
|
60
|
+
|
61
|
+
def test_mixed(self):
|
62
|
+
args = BenchArgs()
|
63
|
+
url = DEFAULT_URL_FOR_TEST
|
64
|
+
args.host, args.port = self._extract_host_and_port(url)
|
65
|
+
args.test_mode = "mixed"
|
66
|
+
args.n_start = 10
|
67
|
+
args.n_trials = 20
|
68
|
+
results = test_deterministic(args)
|
69
|
+
for result in results:
|
70
|
+
assert result == 1
|
71
|
+
|
72
|
+
def test_prefix(self):
|
73
|
+
args = BenchArgs()
|
74
|
+
url = DEFAULT_URL_FOR_TEST
|
75
|
+
args.host, args.port = self._extract_host_and_port(url)
|
76
|
+
args.test_mode = "prefix"
|
77
|
+
args.n_start = 10
|
78
|
+
args.n_trials = 10
|
79
|
+
results = test_deterministic(args)
|
80
|
+
for result in results:
|
81
|
+
assert result == 1
|
@@ -1,13 +1,17 @@
|
|
1
|
+
import os
|
1
2
|
import time
|
3
|
+
import warnings
|
2
4
|
from urllib.parse import urlparse
|
3
5
|
|
4
6
|
import requests
|
5
7
|
|
8
|
+
from sglang.srt.environ import envs
|
6
9
|
from sglang.srt.utils import kill_process_tree
|
7
10
|
from sglang.test.test_utils import (
|
8
11
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
9
12
|
DEFAULT_URL_FOR_TEST,
|
10
13
|
CustomTestCase,
|
14
|
+
is_in_ci,
|
11
15
|
popen_with_error_check,
|
12
16
|
)
|
13
17
|
|
@@ -27,6 +31,24 @@ class TestDisaggregationBase(CustomTestCase):
|
|
27
31
|
print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
|
28
32
|
cls.process_lb, cls.process_decode, cls.process_prefill = None, None, None
|
29
33
|
|
34
|
+
# config transfer backend and rdma devices
|
35
|
+
if is_in_ci():
|
36
|
+
cls.transfer_backend = ["--disaggregation-transfer-backend", "mooncake"]
|
37
|
+
cls.rdma_devices = ["--disaggregation-ib-device", get_rdma_devices_args()]
|
38
|
+
else:
|
39
|
+
cls.transfer_backend = [
|
40
|
+
"--disaggregation-transfer-backend",
|
41
|
+
envs.SGLANG_TEST_PD_DISAGG_BACKEND.get(),
|
42
|
+
]
|
43
|
+
cls.rdma_devices = [
|
44
|
+
"--disaggregation-ib-device",
|
45
|
+
envs.SGLANG_TEST_PD_DISAGG_DEVICES.get(),
|
46
|
+
]
|
47
|
+
if cls.rdma_devices[1] is None:
|
48
|
+
cls.rdma_devices = []
|
49
|
+
msg = "No RDMA devices specified for disaggregation test, using default settings."
|
50
|
+
warnings.warn(msg)
|
51
|
+
|
30
52
|
@classmethod
|
31
53
|
def launch_lb(cls):
|
32
54
|
lb_command = [
|
@@ -75,3 +97,44 @@ class TestDisaggregationBase(CustomTestCase):
|
|
75
97
|
|
76
98
|
# wait for 5 seconds
|
77
99
|
time.sleep(5)
|
100
|
+
|
101
|
+
|
102
|
+
def get_rdma_devices_args():
|
103
|
+
# 1. Get visible GPU indices
|
104
|
+
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
|
105
|
+
if not cuda_visible_devices:
|
106
|
+
warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
|
107
|
+
return "mlx5_roce0,mlx5_roce4"
|
108
|
+
|
109
|
+
try:
|
110
|
+
# Convert to list of integers (handling possible spaces and empty strings)
|
111
|
+
gpu_indices = [
|
112
|
+
int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
|
113
|
+
]
|
114
|
+
if not gpu_indices or len(gpu_indices) > 4:
|
115
|
+
return "mlx5_roce0,mlx5_roce4"
|
116
|
+
except ValueError:
|
117
|
+
warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
|
118
|
+
return "mlx5_roce0,mlx5_roce4"
|
119
|
+
|
120
|
+
# 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
|
121
|
+
base_rdma_group = min(gpu_indices) // 4 * 4
|
122
|
+
|
123
|
+
# 3. Generate RDMA device names
|
124
|
+
rdma_devices = []
|
125
|
+
for gpu_idx in gpu_indices:
|
126
|
+
# Validate GPU index within expected range
|
127
|
+
if gpu_idx < base_rdma_group or gpu_idx >= base_rdma_group + 4:
|
128
|
+
warnings.warn(
|
129
|
+
f"GPU index {gpu_idx} is outside expected group {base_rdma_group}-{base_rdma_group+3}"
|
130
|
+
)
|
131
|
+
continue
|
132
|
+
|
133
|
+
# Map GPU index to RDMA device index
|
134
|
+
rdma_index = base_rdma_group // 4 * 4 + (gpu_idx % 4)
|
135
|
+
rdma_devices.append(f"mlx5_roce{rdma_index}")
|
136
|
+
|
137
|
+
if not rdma_devices:
|
138
|
+
return "mlx5_roce0,mlx5_roce4"
|
139
|
+
|
140
|
+
return ",".join(rdma_devices)
|
sglang/test/test_utils.py
CHANGED
@@ -20,7 +20,6 @@ from functools import partial
|
|
20
20
|
from pathlib import Path
|
21
21
|
from types import SimpleNamespace
|
22
22
|
from typing import Any, Awaitable, Callable, List, Optional, Tuple
|
23
|
-
from urllib.parse import quote
|
24
23
|
|
25
24
|
import aiohttp
|
26
25
|
import numpy as np
|
@@ -509,6 +508,7 @@ def popen_launch_server(
|
|
509
508
|
return_stdout_stderr: Optional[tuple] = None,
|
510
509
|
device: str = "auto",
|
511
510
|
pd_separated: bool = False,
|
511
|
+
num_replicas: Optional[int] = None,
|
512
512
|
):
|
513
513
|
"""Launch a server process with automatic device detection.
|
514
514
|
|
@@ -526,7 +526,8 @@ def popen_launch_server(
|
|
526
526
|
_, host, port = base_url.split(":")
|
527
527
|
host = host[2:]
|
528
528
|
|
529
|
-
|
529
|
+
use_mixed_pd_engine = not pd_separated and num_replicas is not None
|
530
|
+
if pd_separated or use_mixed_pd_engine:
|
530
531
|
command = "sglang.launch_pd_server"
|
531
532
|
else:
|
532
533
|
command = "sglang.launch_server"
|
@@ -540,7 +541,7 @@ def popen_launch_server(
|
|
540
541
|
*[str(x) for x in other_args],
|
541
542
|
]
|
542
543
|
|
543
|
-
if pd_separated:
|
544
|
+
if pd_separated or use_mixed_pd_engine:
|
544
545
|
command.extend(
|
545
546
|
[
|
546
547
|
"--lb-host",
|
@@ -559,6 +560,15 @@ def popen_launch_server(
|
|
559
560
|
]
|
560
561
|
)
|
561
562
|
|
563
|
+
if use_mixed_pd_engine:
|
564
|
+
command.extend(
|
565
|
+
[
|
566
|
+
"--mixed",
|
567
|
+
"--num-replicas",
|
568
|
+
str(num_replicas),
|
569
|
+
]
|
570
|
+
)
|
571
|
+
|
562
572
|
if api_key:
|
563
573
|
command += ["--api-key", api_key]
|
564
574
|
|
@@ -1149,7 +1159,7 @@ def run_bench_offline_throughput(model, other_args):
|
|
1149
1159
|
*[str(x) for x in other_args],
|
1150
1160
|
]
|
1151
1161
|
|
1152
|
-
print(f"{command
|
1162
|
+
print(f"command={' '.join(command)}")
|
1153
1163
|
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
1154
1164
|
|
1155
1165
|
try:
|
@@ -1641,15 +1651,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
|
|
1641
1651
|
return text.removesuffix(suffix)
|
1642
1652
|
|
1643
1653
|
|
1644
|
-
class
|
1645
|
-
def __init__(
|
1654
|
+
class ModelLaunchSettings:
|
1655
|
+
def __init__(
|
1656
|
+
self,
|
1657
|
+
model_path: str,
|
1658
|
+
tp_size: int = 1,
|
1659
|
+
extra_args: Optional[List[str]] = None,
|
1660
|
+
env: Optional[dict] = None,
|
1661
|
+
):
|
1646
1662
|
self.model_path = model_path
|
1647
|
-
|
1648
|
-
|
1649
|
-
|
1650
|
-
|
1663
|
+
self.tp_size = tp_size
|
1664
|
+
self.extra_args = list(extra_args) if extra_args else []
|
1665
|
+
self.env = env
|
1666
|
+
|
1667
|
+
if self.tp_size > 1 and "--tp" not in self.extra_args:
|
1668
|
+
self.extra_args.extend(["--tp", str(self.tp_size)])
|
1651
1669
|
|
1652
|
-
|
1670
|
+
fixed_args = ["--enable-multimodal", "--trust-remote-code"]
|
1671
|
+
for fixed_arg in fixed_args:
|
1672
|
+
if fixed_arg not in self.extra_args:
|
1673
|
+
self.extra_args.append(fixed_arg)
|
1653
1674
|
|
1654
1675
|
|
1655
1676
|
class ModelEvalMetrics:
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.3"
|
1
|
+
__version__ = "0.5.3.post1"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.5.3
|
3
|
+
Version: 0.5.3.post1
|
4
4
|
Summary: SGLang is a fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -221,7 +221,7 @@ Requires-Dist: cuda-python
|
|
221
221
|
Requires-Dist: datasets
|
222
222
|
Requires-Dist: einops
|
223
223
|
Requires-Dist: fastapi
|
224
|
-
Requires-Dist: flashinfer_python==0.4.
|
224
|
+
Requires-Dist: flashinfer_python==0.4.0
|
225
225
|
Requires-Dist: hf_transfer
|
226
226
|
Requires-Dist: huggingface_hub
|
227
227
|
Requires-Dist: interegular
|
@@ -250,7 +250,7 @@ Requires-Dist: requests
|
|
250
250
|
Requires-Dist: scipy
|
251
251
|
Requires-Dist: sentencepiece
|
252
252
|
Requires-Dist: setproctitle
|
253
|
-
Requires-Dist: sgl-kernel==0.3.
|
253
|
+
Requires-Dist: sgl-kernel==0.3.15
|
254
254
|
Requires-Dist: soundfile==0.13.1
|
255
255
|
Requires-Dist: tiktoken
|
256
256
|
Requires-Dist: timm==1.0.16
|
@@ -263,7 +263,7 @@ Requires-Dist: tqdm
|
|
263
263
|
Requires-Dist: transformers==4.57.0
|
264
264
|
Requires-Dist: uvicorn
|
265
265
|
Requires-Dist: uvloop
|
266
|
-
Requires-Dist: xgrammar==0.1.
|
266
|
+
Requires-Dist: xgrammar==0.1.25
|
267
267
|
Requires-Dist: grpcio==1.75.1
|
268
268
|
Requires-Dist: grpcio-tools==1.75.1
|
269
269
|
Provides-Extra: decord
|