sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. sglang/bench_one_batch.py +0 -2
  2. sglang/bench_serving.py +224 -127
  3. sglang/compile_deep_gemm.py +3 -0
  4. sglang/launch_server.py +0 -14
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/falcon_h1.py +12 -58
  7. sglang/srt/configs/mamba_utils.py +117 -0
  8. sglang/srt/configs/model_config.py +68 -31
  9. sglang/srt/configs/nemotron_h.py +286 -0
  10. sglang/srt/configs/qwen3_next.py +11 -43
  11. sglang/srt/disaggregation/decode.py +7 -18
  12. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  13. sglang/srt/disaggregation/nixl/conn.py +55 -23
  14. sglang/srt/disaggregation/prefill.py +17 -32
  15. sglang/srt/entrypoints/engine.py +2 -2
  16. sglang/srt/entrypoints/grpc_request_manager.py +10 -23
  17. sglang/srt/entrypoints/grpc_server.py +220 -80
  18. sglang/srt/entrypoints/http_server.py +49 -1
  19. sglang/srt/entrypoints/openai/protocol.py +159 -31
  20. sglang/srt/entrypoints/openai/serving_chat.py +13 -71
  21. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  22. sglang/srt/environ.py +4 -0
  23. sglang/srt/function_call/function_call_parser.py +8 -6
  24. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  25. sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
  26. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
  27. sglang/srt/layers/attention/attention_registry.py +31 -22
  28. sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
  29. sglang/srt/layers/attention/flashattention_backend.py +0 -1
  30. sglang/srt/layers/attention/flashinfer_backend.py +223 -6
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
  32. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
  33. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  34. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
  35. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  36. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  37. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  38. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  39. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  40. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  41. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  42. sglang/srt/layers/attention/triton_backend.py +1 -1
  43. sglang/srt/layers/logits_processor.py +136 -6
  44. sglang/srt/layers/modelopt_utils.py +11 -0
  45. sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
  46. sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
  47. sglang/srt/layers/moe/ep_moe/layer.py +8 -286
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
  49. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  50. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  51. sglang/srt/layers/moe/utils.py +7 -1
  52. sglang/srt/layers/quantization/__init__.py +1 -1
  53. sglang/srt/layers/quantization/fp8.py +84 -18
  54. sglang/srt/layers/quantization/modelopt_quant.py +1 -1
  55. sglang/srt/layers/quantization/quark/quark.py +3 -1
  56. sglang/srt/layers/quantization/w4afp8.py +2 -16
  57. sglang/srt/lora/lora_manager.py +0 -8
  58. sglang/srt/managers/overlap_utils.py +18 -16
  59. sglang/srt/managers/schedule_batch.py +119 -90
  60. sglang/srt/managers/schedule_policy.py +1 -1
  61. sglang/srt/managers/scheduler.py +213 -126
  62. sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
  64. sglang/srt/managers/tokenizer_manager.py +270 -53
  65. sglang/srt/managers/tp_worker.py +39 -28
  66. sglang/srt/mem_cache/allocator.py +7 -2
  67. sglang/srt/mem_cache/chunk_cache.py +1 -1
  68. sglang/srt/mem_cache/memory_pool.py +162 -68
  69. sglang/srt/mem_cache/radix_cache.py +8 -3
  70. sglang/srt/mem_cache/swa_radix_cache.py +70 -14
  71. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  72. sglang/srt/model_executor/forward_batch_info.py +4 -18
  73. sglang/srt/model_executor/model_runner.py +55 -51
  74. sglang/srt/model_loader/__init__.py +1 -1
  75. sglang/srt/model_loader/loader.py +187 -6
  76. sglang/srt/model_loader/weight_utils.py +3 -0
  77. sglang/srt/models/falcon_h1.py +11 -9
  78. sglang/srt/models/gemma3_mm.py +16 -0
  79. sglang/srt/models/grok.py +5 -13
  80. sglang/srt/models/mixtral.py +1 -3
  81. sglang/srt/models/mllama4.py +11 -1
  82. sglang/srt/models/nemotron_h.py +514 -0
  83. sglang/srt/models/utils.py +5 -1
  84. sglang/srt/sampling/sampling_batch_info.py +11 -9
  85. sglang/srt/server_args.py +100 -33
  86. sglang/srt/speculative/eagle_worker.py +11 -13
  87. sglang/srt/speculative/ngram_worker.py +12 -11
  88. sglang/srt/speculative/spec_utils.py +0 -1
  89. sglang/srt/two_batch_overlap.py +1 -0
  90. sglang/srt/utils/common.py +18 -0
  91. sglang/srt/utils/hf_transformers_utils.py +2 -0
  92. sglang/test/longbench_v2/__init__.py +1 -0
  93. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  94. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  95. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  96. sglang/test/run_eval.py +40 -0
  97. sglang/test/simple_eval_longbench_v2.py +332 -0
  98. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  99. sglang/test/test_deterministic.py +18 -2
  100. sglang/test/test_deterministic_utils.py +81 -0
  101. sglang/test/test_disaggregation_utils.py +63 -0
  102. sglang/test/test_utils.py +32 -11
  103. sglang/version.py +1 -1
  104. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
  105. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
  106. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  107. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  108. sglang/test/test_block_fp8_ep.py +0 -358
  109. /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
  110. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  111. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  112. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -120,7 +120,7 @@ def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dty
120
120
  )
121
121
  topk_weights, topk_ids, _ = topk_output
122
122
  expert_map = torch.arange(E, dtype=torch.int32, device=device)
123
- expert_map[local_e:] = E
123
+ expert_map[local_e:] = -1
124
124
 
125
125
  output = cutlass_moe(
126
126
  a,
@@ -138,9 +138,7 @@ def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dty
138
138
  c_strides2,
139
139
  s_strides13,
140
140
  s_strides2,
141
- 0,
142
- local_e - 1,
143
- E,
141
+ local_e,
144
142
  a1_scale,
145
143
  a2_scale,
146
144
  expert_map,
@@ -178,7 +176,7 @@ def cutlass_moe(
178
176
  w1_scale: torch.Tensor,
179
177
  w2_scale: torch.Tensor,
180
178
  topk_weights: torch.Tensor,
181
- topk_ids_: torch.Tensor,
179
+ topk_ids: torch.Tensor,
182
180
  a_strides1: torch.Tensor,
183
181
  b_strides1: torch.Tensor,
184
182
  c_strides1: torch.Tensor,
@@ -187,40 +185,32 @@ def cutlass_moe(
187
185
  c_strides2: torch.Tensor,
188
186
  s_strides13: torch.Tensor,
189
187
  s_strides2: torch.Tensor,
190
- start_expert_id: int,
191
- end_expert_id: int,
192
- E: int,
188
+ num_local_experts: int,
193
189
  a1_scale: Optional[torch.Tensor] = None,
194
190
  a2_scale: Optional[torch.Tensor] = None,
195
191
  expert_map: Optional[torch.Tensor] = None,
196
192
  apply_router_weight_on_input: bool = False,
197
193
  ):
198
- local_topk_ids = topk_ids_
199
- local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
194
+ topk_ids = expert_map[topk_ids]
200
195
  device = a.device
201
196
 
202
- local_num_experts = end_expert_id - start_expert_id + 1
203
197
  expert_offsets = torch.empty(
204
- (local_num_experts + 1), dtype=torch.int32, device=device
198
+ (num_local_experts + 1), dtype=torch.int32, device=device
205
199
  )
206
200
  problem_sizes1 = torch.empty(
207
- (local_num_experts, 3), dtype=torch.int32, device=device
201
+ (num_local_experts, 3), dtype=torch.int32, device=device
208
202
  )
209
203
  problem_sizes2 = torch.empty(
210
- (local_num_experts, 3), dtype=torch.int32, device=device
204
+ (num_local_experts, 3), dtype=torch.int32, device=device
211
205
  )
212
206
  return cutlass_w4a8_moe(
213
- start_expert_id,
214
- end_expert_id,
215
- E,
216
207
  a,
217
208
  w1_q,
218
209
  w2_q,
219
210
  w1_scale,
220
211
  w2_scale,
221
212
  topk_weights,
222
- topk_ids_,
223
- local_topk_ids,
213
+ topk_ids,
224
214
  a_strides1,
225
215
  b_strides1,
226
216
  c_strides1,
@@ -39,12 +39,15 @@ class BenchArgs:
39
39
  profile_steps: int = 3
40
40
  profile_by_stage: bool = False
41
41
  test_mode: str = "single"
42
+ n_trials: int = 50
43
+ n_start: int = 1
42
44
 
43
45
  @staticmethod
44
46
  def add_cli_args(parser: argparse.ArgumentParser):
45
47
  parser.add_argument("--host", type=str, default=BenchArgs.host)
46
48
  parser.add_argument("--port", type=int, default=BenchArgs.port)
47
- parser.add_argument("--n-trials", type=int, default=50)
49
+ parser.add_argument("--n-trials", type=int, default=BenchArgs.n_trials)
50
+ parser.add_argument("--n-start", type=int, default=BenchArgs.n_start)
48
51
  parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
49
52
  parser.add_argument(
50
53
  "--sampling-seed", type=int, default=BenchArgs.sampling_seed
@@ -238,6 +241,8 @@ def test_deterministic(args):
238
241
  texts.append(text)
239
242
 
240
243
  print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
244
+ return [len(set(texts))]
245
+
241
246
  elif args.test_mode == "mixed":
242
247
  # In mixed mode, we send a mixture of two short prompts and one long prompt in the same batch with batch size ranging from 1 to n_trials.
243
248
  output_prompt_1 = []
@@ -264,13 +269,19 @@ def test_deterministic(args):
264
269
  f"Long prompt: total samples: {len(output_long_prompt)}, Unique samples: {len(set(output_long_prompt))}"
265
270
  )
266
271
 
272
+ return [
273
+ len(set(output_prompt_1)),
274
+ len(set(output_prompt_2)),
275
+ len(set(output_long_prompt)),
276
+ ]
277
+
267
278
  elif args.test_mode == "prefix":
268
279
  # In prefix mode, we create prompts from the same long prompt, with different lengths of common prefix.
269
280
  len_prefix = [1, 511, 2048, 4097]
270
281
  num_prompts = len(len_prefix)
271
282
  outputs = {i: [] for i in range(4)}
272
283
  prompts = [LONG_PROMPT[: len_prefix[i]] for i in range(4)]
273
- for i in range(1, args.n_trials + 1):
284
+ for i in range(args.n_start, args.n_start + args.n_trials):
274
285
  batch_size = i
275
286
  ret_dict = send_prefix(args, batch_size, prompts)
276
287
  msg = f"Testing Trial {i} with batch size {batch_size},"
@@ -285,6 +296,11 @@ def test_deterministic(args):
285
296
  f"Prompt {i} with prefix length {len_prefix[i]}: total samples: {len(outputs[i])}, Unique samples: {len(set(outputs[i]))}"
286
297
  )
287
298
 
299
+ results = []
300
+ for i in range(num_prompts):
301
+ results.append(len(set(outputs[i])))
302
+ return results
303
+
288
304
  else:
289
305
  raise ValueError(f"Invalid test mode: {args.test_mode}")
290
306
 
@@ -0,0 +1,81 @@
1
+ import time
2
+ import unittest
3
+
4
+ import requests
5
+
6
+ from sglang.srt.utils import kill_process_tree
7
+ from sglang.test.test_deterministic import BenchArgs, test_deterministic
8
+ from sglang.test.test_utils import (
9
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
10
+ DEFAULT_URL_FOR_TEST,
11
+ CustomTestCase,
12
+ popen_launch_server,
13
+ )
14
+
15
+ DEFAULT_MODEL = "Qwen/Qwen3-8B"
16
+ COMMON_SERVER_ARGS = [
17
+ "--trust-remote-code",
18
+ "--cuda-graph-max-bs",
19
+ "32",
20
+ "--enable-deterministic-inference",
21
+ ]
22
+
23
+
24
+ class TestDeterministicBase(CustomTestCase):
25
+ @classmethod
26
+ def get_server_args(cls):
27
+ return COMMON_SERVER_ARGS
28
+
29
+ @classmethod
30
+ def setUpClass(cls):
31
+ cls.model = DEFAULT_MODEL
32
+ cls.base_url = DEFAULT_URL_FOR_TEST
33
+ if "--attention-backend" not in cls.get_server_args():
34
+ raise unittest.SkipTest("Skip the base test class")
35
+
36
+ cls.process = popen_launch_server(
37
+ cls.model,
38
+ cls.base_url,
39
+ timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
40
+ other_args=cls.get_server_args(),
41
+ )
42
+
43
+ @classmethod
44
+ def tearDownClass(cls):
45
+ kill_process_tree(cls.process.pid)
46
+
47
+ def _extract_host_and_port(self, url):
48
+ return url.split("://")[-1].split(":")[0], int(url.split(":")[-1])
49
+
50
+ def test_single(self):
51
+ args = BenchArgs()
52
+ url = DEFAULT_URL_FOR_TEST
53
+ args.host, args.port = self._extract_host_and_port(url)
54
+ args.test_mode = "single"
55
+ args.n_start = 10
56
+ args.n_trials = 20
57
+ results = test_deterministic(args)
58
+ for result in results:
59
+ assert result == 1
60
+
61
+ def test_mixed(self):
62
+ args = BenchArgs()
63
+ url = DEFAULT_URL_FOR_TEST
64
+ args.host, args.port = self._extract_host_and_port(url)
65
+ args.test_mode = "mixed"
66
+ args.n_start = 10
67
+ args.n_trials = 20
68
+ results = test_deterministic(args)
69
+ for result in results:
70
+ assert result == 1
71
+
72
+ def test_prefix(self):
73
+ args = BenchArgs()
74
+ url = DEFAULT_URL_FOR_TEST
75
+ args.host, args.port = self._extract_host_and_port(url)
76
+ args.test_mode = "prefix"
77
+ args.n_start = 10
78
+ args.n_trials = 10
79
+ results = test_deterministic(args)
80
+ for result in results:
81
+ assert result == 1
@@ -1,13 +1,17 @@
1
+ import os
1
2
  import time
3
+ import warnings
2
4
  from urllib.parse import urlparse
3
5
 
4
6
  import requests
5
7
 
8
+ from sglang.srt.environ import envs
6
9
  from sglang.srt.utils import kill_process_tree
7
10
  from sglang.test.test_utils import (
8
11
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
9
12
  DEFAULT_URL_FOR_TEST,
10
13
  CustomTestCase,
14
+ is_in_ci,
11
15
  popen_with_error_check,
12
16
  )
13
17
 
@@ -27,6 +31,24 @@ class TestDisaggregationBase(CustomTestCase):
27
31
  print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
28
32
  cls.process_lb, cls.process_decode, cls.process_prefill = None, None, None
29
33
 
34
+ # config transfer backend and rdma devices
35
+ if is_in_ci():
36
+ cls.transfer_backend = ["--disaggregation-transfer-backend", "mooncake"]
37
+ cls.rdma_devices = ["--disaggregation-ib-device", get_rdma_devices_args()]
38
+ else:
39
+ cls.transfer_backend = [
40
+ "--disaggregation-transfer-backend",
41
+ envs.SGLANG_TEST_PD_DISAGG_BACKEND.get(),
42
+ ]
43
+ cls.rdma_devices = [
44
+ "--disaggregation-ib-device",
45
+ envs.SGLANG_TEST_PD_DISAGG_DEVICES.get(),
46
+ ]
47
+ if cls.rdma_devices[1] is None:
48
+ cls.rdma_devices = []
49
+ msg = "No RDMA devices specified for disaggregation test, using default settings."
50
+ warnings.warn(msg)
51
+
30
52
  @classmethod
31
53
  def launch_lb(cls):
32
54
  lb_command = [
@@ -75,3 +97,44 @@ class TestDisaggregationBase(CustomTestCase):
75
97
 
76
98
  # wait for 5 seconds
77
99
  time.sleep(5)
100
+
101
+
102
+ def get_rdma_devices_args():
103
+ # 1. Get visible GPU indices
104
+ cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
105
+ if not cuda_visible_devices:
106
+ warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
107
+ return "mlx5_roce0,mlx5_roce4"
108
+
109
+ try:
110
+ # Convert to list of integers (handling possible spaces and empty strings)
111
+ gpu_indices = [
112
+ int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
113
+ ]
114
+ if not gpu_indices or len(gpu_indices) > 4:
115
+ return "mlx5_roce0,mlx5_roce4"
116
+ except ValueError:
117
+ warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
118
+ return "mlx5_roce0,mlx5_roce4"
119
+
120
+ # 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
121
+ base_rdma_group = min(gpu_indices) // 4 * 4
122
+
123
+ # 3. Generate RDMA device names
124
+ rdma_devices = []
125
+ for gpu_idx in gpu_indices:
126
+ # Validate GPU index within expected range
127
+ if gpu_idx < base_rdma_group or gpu_idx >= base_rdma_group + 4:
128
+ warnings.warn(
129
+ f"GPU index {gpu_idx} is outside expected group {base_rdma_group}-{base_rdma_group+3}"
130
+ )
131
+ continue
132
+
133
+ # Map GPU index to RDMA device index
134
+ rdma_index = base_rdma_group // 4 * 4 + (gpu_idx % 4)
135
+ rdma_devices.append(f"mlx5_roce{rdma_index}")
136
+
137
+ if not rdma_devices:
138
+ return "mlx5_roce0,mlx5_roce4"
139
+
140
+ return ",".join(rdma_devices)
sglang/test/test_utils.py CHANGED
@@ -20,7 +20,6 @@ from functools import partial
20
20
  from pathlib import Path
21
21
  from types import SimpleNamespace
22
22
  from typing import Any, Awaitable, Callable, List, Optional, Tuple
23
- from urllib.parse import quote
24
23
 
25
24
  import aiohttp
26
25
  import numpy as np
@@ -509,6 +508,7 @@ def popen_launch_server(
509
508
  return_stdout_stderr: Optional[tuple] = None,
510
509
  device: str = "auto",
511
510
  pd_separated: bool = False,
511
+ num_replicas: Optional[int] = None,
512
512
  ):
513
513
  """Launch a server process with automatic device detection.
514
514
 
@@ -526,7 +526,8 @@ def popen_launch_server(
526
526
  _, host, port = base_url.split(":")
527
527
  host = host[2:]
528
528
 
529
- if pd_separated:
529
+ use_mixed_pd_engine = not pd_separated and num_replicas is not None
530
+ if pd_separated or use_mixed_pd_engine:
530
531
  command = "sglang.launch_pd_server"
531
532
  else:
532
533
  command = "sglang.launch_server"
@@ -540,7 +541,7 @@ def popen_launch_server(
540
541
  *[str(x) for x in other_args],
541
542
  ]
542
543
 
543
- if pd_separated:
544
+ if pd_separated or use_mixed_pd_engine:
544
545
  command.extend(
545
546
  [
546
547
  "--lb-host",
@@ -559,6 +560,15 @@ def popen_launch_server(
559
560
  ]
560
561
  )
561
562
 
563
+ if use_mixed_pd_engine:
564
+ command.extend(
565
+ [
566
+ "--mixed",
567
+ "--num-replicas",
568
+ str(num_replicas),
569
+ ]
570
+ )
571
+
562
572
  if api_key:
563
573
  command += ["--api-key", api_key]
564
574
 
@@ -1149,7 +1159,7 @@ def run_bench_offline_throughput(model, other_args):
1149
1159
  *[str(x) for x in other_args],
1150
1160
  ]
1151
1161
 
1152
- print(f"{command=}")
1162
+ print(f"command={' '.join(command)}")
1153
1163
  process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1154
1164
 
1155
1165
  try:
@@ -1641,15 +1651,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
1641
1651
  return text.removesuffix(suffix)
1642
1652
 
1643
1653
 
1644
- class ModelDeploySetup:
1645
- def __init__(self, model_path: str, extra_args: List[str] = []):
1654
+ class ModelLaunchSettings:
1655
+ def __init__(
1656
+ self,
1657
+ model_path: str,
1658
+ tp_size: int = 1,
1659
+ extra_args: Optional[List[str]] = None,
1660
+ env: Optional[dict] = None,
1661
+ ):
1646
1662
  self.model_path = model_path
1647
- if "--enable-multimodal" not in extra_args:
1648
- extra_args.append("--enable-multimodal")
1649
- if "--trust-remote-code" not in extra_args:
1650
- extra_args.append("--trust-remote-code")
1663
+ self.tp_size = tp_size
1664
+ self.extra_args = list(extra_args) if extra_args else []
1665
+ self.env = env
1666
+
1667
+ if self.tp_size > 1 and "--tp" not in self.extra_args:
1668
+ self.extra_args.extend(["--tp", str(self.tp_size)])
1651
1669
 
1652
- self.extra_args = extra_args
1670
+ fixed_args = ["--enable-multimodal", "--trust-remote-code"]
1671
+ for fixed_arg in fixed_args:
1672
+ if fixed_arg not in self.extra_args:
1673
+ self.extra_args.append(fixed_arg)
1653
1674
 
1654
1675
 
1655
1676
  class ModelEvalMetrics:
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.3"
1
+ __version__ = "0.5.3.post1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.5.3
3
+ Version: 0.5.3.post1
4
4
  Summary: SGLang is a fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -221,7 +221,7 @@ Requires-Dist: cuda-python
221
221
  Requires-Dist: datasets
222
222
  Requires-Dist: einops
223
223
  Requires-Dist: fastapi
224
- Requires-Dist: flashinfer_python==0.4.0rc3
224
+ Requires-Dist: flashinfer_python==0.4.0
225
225
  Requires-Dist: hf_transfer
226
226
  Requires-Dist: huggingface_hub
227
227
  Requires-Dist: interegular
@@ -250,7 +250,7 @@ Requires-Dist: requests
250
250
  Requires-Dist: scipy
251
251
  Requires-Dist: sentencepiece
252
252
  Requires-Dist: setproctitle
253
- Requires-Dist: sgl-kernel==0.3.14.post1
253
+ Requires-Dist: sgl-kernel==0.3.15
254
254
  Requires-Dist: soundfile==0.13.1
255
255
  Requires-Dist: tiktoken
256
256
  Requires-Dist: timm==1.0.16
@@ -263,7 +263,7 @@ Requires-Dist: tqdm
263
263
  Requires-Dist: transformers==4.57.0
264
264
  Requires-Dist: uvicorn
265
265
  Requires-Dist: uvloop
266
- Requires-Dist: xgrammar==0.1.24
266
+ Requires-Dist: xgrammar==0.1.25
267
267
  Requires-Dist: grpcio==1.75.1
268
268
  Requires-Dist: grpcio-tools==1.75.1
269
269
  Provides-Extra: decord