sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +2 -0
- sglang/check_env.py +3 -3
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +15 -0
- sglang/srt/conversation.py +122 -1
- sglang/srt/entrypoints/engine.py +44 -22
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +107 -82
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +1 -1
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +84 -35
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +25 -15
- sglang/srt/managers/scheduler.py +263 -59
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
- sglang/srt/managers/tp_worker.py +51 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +115 -57
- sglang/srt/models/deepseek_nextn.py +1 -257
- sglang/srt/models/deepseek_v2.py +78 -18
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +92 -30
- sglang/srt/models/llama4.py +2 -1
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +0 -12
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/openai_api/adapter.py +34 -22
- sglang/srt/openai_api/protocol.py +11 -1
- sglang/srt/server_args.py +67 -22
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +88 -9
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +29 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -22,7 +22,7 @@ import random
 import tempfile
 from typing import List, Literal, Optional
 
-from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
@@ -78,6 +78,8 @@ class ServerArgs:
 
     # Other runtime options
     tp_size: int = 1
+    pp_size: int = 1
+    max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -222,25 +224,34 @@ class ServerArgs:
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
+            parallel_size = self.tp_size * self.pp_size
             if gpu_mem <= 81920:
-                if self.tp_size >= 16:
+                if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
-                elif self.tp_size >= 8:
+                elif parallel_size >= 8:
                     self.mem_fraction_static = 0.81
-                elif self.tp_size >= 4:
+                elif parallel_size >= 4:
                     self.mem_fraction_static = 0.85
-                elif self.tp_size >= 2:
+                elif parallel_size >= 2:
                     self.mem_fraction_static = 0.87
                 else:
                     self.mem_fraction_static = 0.88
             else:
-
-
+                self.mem_fraction_static = 0.88
+                if gpu_mem > 96 * 1024:
+                    mem_fraction = self.mem_fraction_static
+                    self.mem_fraction_static = min(
+                        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                        (gpu_mem - 1024 * 18)
+                        / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                    )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
             if gpu_mem is not None and gpu_mem < 25_000:
                 self.chunked_prefill_size = 2048
+            elif self.disaggregation_mode != "null":
+                self.chunked_prefill_size = 16384
             else:
                 self.chunked_prefill_size = 8192
         assert self.chunked_prefill_size % self.page_size == 0
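For reference, here is the new >96 GiB branch worked through with concrete numbers. This is an illustrative sketch: gpu_mem is in MiB (matching the 81920 = 80 GiB threshold above), and 144_384 MiB (~141 GiB) is a hypothetical device size, not a value taken from this diff.

    gpu_mem = 144_384  # hypothetical ~141 GiB device, in MiB
    mem_fraction = 0.88
    mem_fraction_static = min(
        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,  # ~0.921
        (gpu_mem - 1024 * 18) / gpu_mem,  # ~0.872, i.e. leave ~18 GiB of headroom
    )
    print(round(mem_fraction_static, 3))  # 0.872, so the headroom cap wins here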
@@ -333,6 +344,14 @@ class ServerArgs:
                 "eagle speculative decoding."
             )
 
+            model_arch = get_model_arch(self)
+
+            # Auto set draft_model_path DeepSeek-V3/R1
+            if self.speculative_draft_model_path is None and model_arch in [
+                "DeepseekV3ForCausalLM"
+            ]:
+                self.speculative_draft_model_path = self.model_path
+
             # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
@@ -343,7 +362,7 @@ class ServerArgs:
                 self.speculative_num_steps,
                 self.speculative_eagle_topk,
                 self.speculative_num_draft_tokens,
-            ) = auto_choose_speculative_params(self)
+            ) = auto_choose_speculative_params(model_arch)
 
             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -632,6 +651,19 @@ class ServerArgs:
             default=ServerArgs.tp_size,
             help="The tensor parallelism size.",
         )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "--pp-size",
+            type=int,
+            default=ServerArgs.pp_size,
+            help="The pipeline parallelism size.",
+        )
+        parser.add_argument(
+            "--max-micro-batch-size",
+            type=int,
+            default=ServerArgs.max_micro_batch_size,
+            help="The maximum micro batch size in pipeline parallelism.",
+        )
         parser.add_argument(
             "--stream-interval",
             type=int,
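A minimal sketch of how the new flags reach ServerArgs, assuming the usual sglang entry points ServerArgs.add_cli_args and ServerArgs.from_cli_args; the model path is a placeholder, and --pp-size is the short alias added above for --pipeline-parallel-size.

    import argparse

    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args(
        [
            "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
            "--tensor-parallel-size", "2",
            "--pipeline-parallel-size", "2",  # or the --pp-size alias
        ]
    )
    server_args = ServerArgs.from_cli_args(args)
    print(server_args.tp_size, server_args.pp_size)  # 2 2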
@@ -1096,9 +1128,9 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
         )
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1221,6 +1253,7 @@ class ServerArgs:
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
+        args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
@@ -1234,15 +1267,25 @@ class ServerArgs:
 
     def check_server_args(self):
         assert (
-            self.tp_size % self.nnodes == 0
-        ), "tp_size must be divisible by number of nodes"
+            self.tp_size * self.pp_size
+        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+        # FIXME pp constraints
+        if self.pp_size > 1:
+            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
+            self.disable_overlap_schedule = True
+            assert (
+                self.disable_overlap_schedule
+                and self.speculative_algorithm is None
+                and not self.enable_mixed_chunk
+            ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
-            and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
@@ -1368,20 +1411,22 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
-def auto_choose_speculative_params(self: ServerArgs):
+def get_model_arch(args: ServerArgs):
+    hf_config = get_config(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+        model_override_args=json.loads(args.json_model_override_args),
+    )
+    return hf_config.architectures[0]
+
+
+def auto_choose_speculative_params(arch: str):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-    config_path = os.path.join(self.model_path, "config.json")
-    if not os.path.exists(config_path):
-        raise ValueError(f"{config_path} is not found.")
-
-    config = json.load(open(config_path))
-
-    arch = config.get("architectures", ["Unknown"])[0]
-
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
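The removed config.json probing is replaced by a get_config-based lookup of the model architecture, which then drives both the draft-model default and auto_choose_speculative_params. A rough stand-in for what get_model_arch returns, using the plain transformers API rather than sglang's get_config wrapper (model name is illustrative):

    from transformers import AutoConfig

    def model_arch(model_path: str, trust_remote_code: bool = False) -> str:
        # config.architectures is what get_model_arch() reads via hf_config.architectures[0]
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
        return config.architectures[0]

    # e.g. model_arch("deepseek-ai/DeepSeek-V3") -> "DeepseekV3ForCausalLM",
    # which now also defaults speculative_draft_model_path to the model itself.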
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -106,11 +106,12 @@ class EAGLEWorker(TpModelWorker):
         # Init draft worker
         with empty_context():
             super().__init__(
+                server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                server_args=server_args,
-                nccl_port=nccl_port,
+                pp_rank=0,  # FIXME
                 dp_rank=dp_rank,
+                nccl_port=nccl_port,
                 is_draft_worker=True,
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
sglang/srt/utils.py
CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
+
 import base64
 import builtins
 import ctypes
@@ -414,16 +415,40 @@ class LayerFn(Protocol):
 def make_layers(
     num_hidden_layers: int,
     layer_fn: LayerFn,
+    pp_rank: Optional[int] = None,
+    pp_size: Optional[int] = None,
     prefix: str = "",
+    return_tuple: bool = False,
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
+    # circula imports
+    from sglang.srt.distributed import get_pp_indices
+    from sglang.srt.layers.utils import PPMissingLayer
+
+    assert not pp_size or num_hidden_layers >= pp_size
+    start_layer, end_layer = (
+        get_pp_indices(
+            num_hidden_layers,
+            pp_rank,
+            pp_size,
+        )
+        if pp_rank is not None and pp_size is not None
+        else (0, num_hidden_layers)
+    )
     modules = torch.nn.ModuleList(
-        [
+        [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+        + [
             maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(num_hidden_layers)
+            for idx in range(start_layer, end_layer)
+        ]
+        + [
+            PPMissingLayer(return_tuple=return_tuple)
+            for _ in range(end_layer, num_hidden_layers)
         ]
     )
-
+    if pp_rank is None or pp_size is None:
+        return modules
+    return modules, start_layer, end_layer
 
 
 def set_random_seed(seed: int) -> None:
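make_layers now instantiates only the layers owned by the current pipeline rank and pads both ends with PPMissingLayer. A minimal sketch of the kind of contiguous split a get_pp_indices-style helper produces; this is not sglang's implementation, just an even split with the remainder handed to the lowest ranks:

    def pp_indices(num_layers: int, pp_rank: int, pp_size: int) -> tuple:
        # Each rank owns a contiguous [start, end) slice of the layer stack.
        per_rank = num_layers // pp_size
        remainder = num_layers % pp_size
        start = pp_rank * per_rank + min(pp_rank, remainder)
        end = start + per_rank + (1 if pp_rank < remainder else 0)
        return start, end

    # e.g. 30 layers over pp_size=4 -> [(0, 8), (8, 16), (16, 23), (23, 30)]
    print([pp_indices(30, r, 4) for r in range(4)])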
@@ -877,7 +902,7 @@ def broadcast_pyobj(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )
 
-    if rank == 0:
+    if rank == src:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.broadcast(tensor_size, src=src, group=dist_group)
@@ -909,6 +934,50 @@ def broadcast_pyobj(
         return data
 
 
+def point_to_point_pyobj(
+    data: List[Any],
+    rank: int,
+    group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    dst: int = 1,
+):
+    """Send data from src to dst in group."""
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor([0], dtype=torch.long)
+            dist.send(tensor_size, dst=dst, group=group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            )
+            tensor_size = torch.tensor([size], dtype=torch.long)
+
+            dist.send(tensor_size, dst=dst, group=group)
+            dist.send(tensor_data, dst=dst, group=group)
+        return data
+
+    elif rank == dst:
+        tensor_size = torch.tensor([0], dtype=torch.long)
+        dist.recv(tensor_size, src=src, group=group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(size, dtype=torch.uint8)
+        dist.recv(tensor_data, src=src, group=group)
+
+        serialized_data = bytes(tensor_data.cpu().numpy())
+        data = pickle.loads(serialized_data)
+        return data
+
+    # Other ranks in pp_group do nothing
+    return []
+
+
 step_counter = 0
 
 
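A hypothetical two-process usage of the new point_to_point_pyobj helper; the payload and ranks are illustrative, while in sglang it carries Python objects between adjacent pipeline stages:

    # Launch with e.g. `torchrun --nproc_per_node=2 this_script.py`
    import torch.distributed as dist

    from sglang.srt.utils import point_to_point_pyobj

    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()

    if rank == 0:
        point_to_point_pyobj(["hello", {"step": 1}], rank, src=0, dst=1)
    else:
        received = point_to_point_pyobj([], rank, src=0, dst=1)
        print(received)  # ['hello', {'step': 1}]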
@@ -1732,6 +1801,13 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
+def rank0_log(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
 
@@ -1905,13 +1981,16 @@ def fast_topk(values, topk, dim):
         return torch.topk(values, topk, dim=dim)
 
 
-def is_hopper_with_cuda_12_3():
+def _check(cc_major):
     if not is_cuda():
         return False
-
-
-
-
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)
 
 
 def get_free_port():
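The rewritten helper reduces to two facts about the machine: the device's compute-capability major version (8 for Ampere, 9 for Hopper) and whether the CUDA runtime is at least 12.3. A quick, illustrative way to see what the new lambdas would return locally:

    import torch

    if torch.cuda.is_available():
        cc_major, _ = torch.cuda.get_device_capability()
        cuda_version = tuple(map(int, torch.version.cuda.split(".")[:2]))
        # e.g. (9, (12, 4)) on an H100 -> is_hopper_with_cuda_12_3() is True
        # e.g. (8, (12, 4)) on an A100 -> is_ampere_with_cuda_12_3() is True
        print(cc_major, cuda_version)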
sglang/test/runners.py
CHANGED
@@ -423,6 +423,10 @@ class HFRunner:
             )
             del input_logits
 
+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
         return ModelOutput(
             output_strs=output_strs,
             top_input_logprobs=top_input_logprobs,
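The HFRunner change relies on PEFT's unload() to strip the injected LoRA layers and return to base weights once a prompt that used an adapter is done. A standalone sketch; the model and adapter names are placeholders:

    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # placeholder
    model = PeftModel.from_pretrained(base, "path/to/lora-adapter")  # placeholder

    # ... run generation with the adapter applied ...

    base = model.unload()  # remove the LoRA modules, restoring the base weights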
sglang/test/test_utils.py
CHANGED
@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -770,6 +771,34 @@ def run_bench_offline_throughput(model, other_args):
     return output_throughput
 
 
+def run_bench_one_batch_server(
+    model,
+    base_url,
+    server_args,
+    bench_args,
+    other_server_args,
+    simulate_spec_acc_lens=None,
+):
+    from sglang.bench_one_batch_server import run_benchmark
+
+    if simulate_spec_acc_lens is not None:
+        env = {**os.environ, "SIMULATE_ACC_LEN": str(simulate_spec_acc_lens)}
+    else:
+        env = None
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+        env=env,
+    )
+    try:
+        run_benchmark(server_args=server_args, bench_args=bench_args)
+    finally:
+        kill_process_tree(process.pid)
+
+
 def lcs(X, Y):
     m = len(X)
     n = len(Y)
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6.post1"
+__version__ = "0.4.6.post2"
{sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post1
+Version: 0.4.6.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -238,15 +238,16 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
-Requires-Dist: torchao>=0.
+Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"