sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +8 -6
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +14 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +22 -28
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/conn.py +301 -64
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +94 -46
- sglang/srt/disaggregation/prefill.py +20 -15
- sglang/srt/disaggregation/utils.py +47 -18
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +27 -31
- sglang/srt/entrypoints/http_server.py +149 -79
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +897 -0
- sglang/srt/entrypoints/openai/serving_completions.py +425 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +28 -3
- sglang/srt/layers/attention/aiter_backend.py +5 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
- sglang/srt/layers/attention/flashattention_backend.py +43 -23
- sglang/srt/layers/attention/flashinfer_backend.py +9 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
- sglang/srt/layers/attention/flashmla_backend.py +5 -2
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +19 -11
- sglang/srt/layers/communicator.py +5 -5
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +44 -2
- sglang/srt/layers/linear.py +18 -1
- sglang/srt/layers/logits_processor.py +14 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
- sglang/srt/layers/moe/ep_moe/layer.py +286 -13
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
- sglang/srt/layers/moe/topk.py +117 -4
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_utils.py +5 -4
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/rotary_embedding.py +144 -12
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/layers/vocab_parallel_embedding.py +14 -1
- sglang/srt/lora/lora_manager.py +173 -74
- sglang/srt/lora/mem_pool.py +49 -45
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -15
- sglang/srt/managers/expert_distribution.py +21 -0
- sglang/srt/managers/io_struct.py +19 -14
- sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
- sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
- sglang/srt/managers/schedule_batch.py +49 -32
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +189 -68
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +11 -8
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -16
- sglang/srt/mem_cache/hiradix_cache.py +34 -23
- sglang/srt/mem_cache/memory_pool.py +118 -114
- sglang/srt/mem_cache/radix_cache.py +20 -16
- sglang/srt/model_executor/cuda_graph_runner.py +77 -46
- sglang/srt/model_executor/forward_batch_info.py +18 -5
- sglang/srt/model_executor/model_runner.py +27 -8
- sglang/srt/model_loader/loader.py +50 -8
- sglang/srt/model_loader/weight_utils.py +100 -2
- sglang/srt/models/deepseek_nextn.py +35 -30
- sglang/srt/models/deepseek_v2.py +255 -30
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1009 -0
- sglang/srt/models/gemma3n_mm.py +511 -0
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/server_args.py +51 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
- sglang/srt/speculative/eagle_utils.py +80 -8
- sglang/srt/speculative/eagle_worker.py +124 -41
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -1
- sglang/srt/utils.py +248 -11
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -2148
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
```diff
@@ -160,7 +160,7 @@ def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()


-def
+def is_host_cpu_x86() -> bool:
     machine = platform.machine().lower()
     return (
         machine in ("x86_64", "amd64", "i386", "i686")
```
```diff
@@ -169,6 +169,10 @@ def is_cpu() -> bool:
     )


+def is_cpu() -> bool:
+    return os.getenv("SGLANG_USE_CPU_ENGINE", "0") == "1" and is_host_cpu_x86()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
```
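The new CPU-engine gate is opt-in through an environment variable. A usage sketch, assuming sglang 0.4.8.post1 is installed and `is_cpu` / `is_host_cpu_x86` are imported from `sglang.srt.utils`:

```python
import os

# The CPU engine is only reported as available when SGLANG_USE_CPU_ENGINE=1
# and the host reports an x86 machine type.
os.environ["SGLANG_USE_CPU_ENGINE"] = "1"

from sglang.srt.utils import is_cpu, is_host_cpu_x86

print(is_host_cpu_x86())  # True on x86_64/amd64/i386/i686 hosts
print(is_cpu())           # True only when the env var is also set to "1"
```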
```diff
@@ -1291,6 +1295,15 @@ def get_hpu_memory_capacity():
     )


+def get_npu_memory_capacity():
+    try:
+        import torch_npu
+
+        return torch.npu.mem_get_info()[1] // 1024 // 1024  # unit: MB
+    except ImportError as e:
+        raise ImportError("torch_npu is required when run on npu device.")
+
+
 def get_device_memory_capacity(device: str = None):
     if is_cuda():
         gpu_mem = get_nvgpu_memory_capacity()
```
```diff
@@ -1298,6 +1311,8 @@ def get_device_memory_capacity(device: str = None):
         gpu_mem = get_amdgpu_memory_capacity()
     elif device == "hpu":
         gpu_mem = get_hpu_memory_capacity()
+    elif device == "npu":
+        gpu_mem = get_npu_memory_capacity()
     else:
         # GPU memory is not known yet or no GPU is available.
         gpu_mem = None
```
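A small sketch of the new NPU path in `get_device_memory_capacity` (assumes a host with `torch_npu` installed; per the comment in the helper, the value is total device memory in MB):

```python
from sglang.srt.utils import get_device_memory_capacity

# "npu" now routes to get_npu_memory_capacity(); unrecognized device strings
# still fall through to the `gpu_mem = None` branch.
mem_mb = get_device_memory_capacity("npu")
print(f"NPU memory capacity: {mem_mb} MB")
```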
```diff
@@ -1423,6 +1438,11 @@ def get_device(device_id: Optional[int] = None) -> str:
             return "xpu"
         return "xpu:{}".format(device_id)

+    if hasattr(torch, "npu") and torch.npu.is_available():
+        if device_id == None:
+            return "npu"
+        return "npu:{}".format(device_id)
+
     if is_habana_available():
         try:
             import habana_frameworks.torch.hpu
```
```diff
@@ -1436,6 +1456,15 @@ def get_device(device_id: Optional[int] = None) -> str:
                 "Habana frameworks detected, but failed to import 'habana_frameworks.torch.hpu'."
             )

+    if is_cpu():
+        if cpu_has_amx_support():
+            logger.info("Intel AMX is detected, using CPU with Intel AMX support.")
+        else:
+            logger.warning(
+                "CPU device enabled, using torch native backend, low performance expected."
+            )
+        return "cpu"
+
     raise RuntimeError("No accelerator (CUDA, XPU, HPU) is available.")


```
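With the added branches, device auto-detection now probes NPU and, as a last resort before raising, the CPU engine. A sketch of the call; which value comes back obviously depends on the host:

```python
from sglang.srt.utils import get_device

# Returns e.g. "cuda:0", "xpu:0", "npu:0", an HPU device, or "cpu" when the
# CPU engine is enabled; raises RuntimeError if nothing is available.
print(get_device(device_id=0))
```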
```diff
@@ -1497,15 +1526,35 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     return major, minor


+def get_npu_compiler_config():
+    config = {
+        "frozen_parameter": True,
+        "tiling_schedule_optimize": True,
+        "topology_sorting_strategy": "StableRDFS",
+    }
+    return config
+
+
 def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"

     if hasattr(torch, "npu") and torch.npu.is_available():
-
+        try:
+            import torchair
+            import torchair.ge_concrete_graph.ge_converter.experimental.patch_for_hcom_allreduce
+            from torchair.configs.compiler_config import CompilerConfig
+        except ImportError as e:
+            raise ImportError(
+                "NPU detected, but torchair package is not installed. "
+                "Please install torchair for torch.compile support on NPU."
+            )
+        compiler_config = CompilerConfig()
+        predefined_config = get_npu_compiler_config()
+        for k, v in predefined_config.items():
+            setattr(compiler_config.experimental_config, k, v)

-
-        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        npu_backend = torchair.get_npu_backend(compiler_config=compiler_config)
         return npu_backend

     return "inductor"
```
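On NPU hosts `get_compiler_backend` now returns a configured torchair backend object rather than a bare name. A sketch of how the result is typically consumed; the `torch.compile` call site is an assumption, not part of this diff:

```python
import torch
from sglang.srt.utils import get_compiler_backend

# "inductor", "hpu_backend", or a torchair backend callable on NPU;
# torch.compile accepts either a backend name or a callable.
backend = get_compiler_backend()
compiled = torch.compile(lambda x: x * 2, backend=backend)
print(compiled(torch.ones(4)))
```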
```diff
@@ -1868,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
     return port, host


-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank

```
```diff
@@ -1882,6 +1924,9 @@ def rank0_print(msg: str):
         print(msg, flush=True)


+rank0_log = rank0_print
+
+
 def get_cuda_version():
     if torch.version.cuda:
         return tuple(map(int, torch.version.cuda.split(".")))
```
```diff
@@ -2105,6 +2150,44 @@ def get_free_port():
         return s.getsockname()[1]


+def get_local_ip_auto() -> str:
+    interface = os.environ.get("SGLANG_LOCAL_IP_NIC", None)
+    return (
+        get_local_ip_by_nic(interface)
+        if interface is not None
+        else get_local_ip_by_remote()
+    )
+
+
+def get_local_ip_by_nic(interface: str) -> str:
+    try:
+        import netifaces
+    except ImportError as e:
+        raise ImportError(
+            "Environment variable SGLANG_LOCAL_IP_NIC requires package netifaces, please install it through 'pip install netifaces'"
+        ) from e
+
+    try:
+        addresses = netifaces.ifaddresses(interface)
+        if netifaces.AF_INET in addresses:
+            for addr_info in addresses[netifaces.AF_INET]:
+                ip = addr_info.get("addr")
+                if ip and ip != "127.0.0.1" and ip != "0.0.0.0":
+                    return ip
+        if netifaces.AF_INET6 in addresses:
+            for addr_info in addresses[netifaces.AF_INET6]:
+                ip = addr_info.get("addr")
+                if ip and not ip.startswith("fe80::") and ip != "::1":
+                    return ip.split("%")[0]
+    except (ValueError, OSError) as e:
+        raise ValueError(
+            "Can not get local ip from NIC. Please verify whether SGLANG_LOCAL_IP_NIC is set correctly."
+        )
+
+    # Fallback
+    return get_local_ip_by_remote()
+
+
 def get_local_ip_by_remote() -> str:
     # try ipv4
     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
```
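The new IP helpers prefer an explicitly named NIC and otherwise fall back to the existing remote probe. A sketch; the interface name `eth0` is only a placeholder, and the NIC path needs the `netifaces` package:

```python
import os
from sglang.srt.utils import get_local_ip_auto

# Without SGLANG_LOCAL_IP_NIC this falls back to get_local_ip_by_remote().
print(get_local_ip_auto())

# With a NIC name it returns the first usable IPv4/IPv6 address on that interface.
os.environ["SGLANG_LOCAL_IP_NIC"] = "eth0"
print(get_local_ip_auto())
```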
```diff
@@ -2216,6 +2299,51 @@ class Withable(Generic[T]):
         self._value = None


+def require_mlp_tp_gather(server_args):
+    """
+    Check if the input of MLP is obtained by all-gather rather than all-reduce. This only happens when each MLP TP group contains multiple attention DP groups.
+    """
+    if server_args.enable_dp_attention:
+        assert server_args.dp_size > 1, "dp_size must be greater than 1"
+        if (
+            server_args.moe_dense_tp_size is None
+        ):  # TODO(ch-wan): some MoE models do not have dense layers
+            return True
+        elif not server_args.enable_dp_lm_head:
+            return True
+        elif not server_args.enable_deepep_moe:
+            return True
+        else:
+            return (
+                server_args.moe_dense_tp_size
+                > server_args.tp_size // server_args.dp_size
+            )
+    else:
+        return False
+
+
+def require_attn_tp_gather(server_args):
+    """
+    Check if the input of attention is scattered.
+    """
+    assert server_args.moe_dense_tp_size in [1, None]
+    if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
+        if server_args.enable_dp_attention:
+            return server_args.dp_size < server_args.tp_size
+        else:
+            return True
+    else:
+        return False
+
+
+def require_gathered_buffer(server_args):
+    return require_mlp_tp_gather(server_args) or require_attn_tp_gather(server_args)
+
+
+def require_mlp_sync(server_args):
+    return server_args.enable_dp_attention or require_gathered_buffer(server_args)
+
+
 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],
     rhs: Optional[torch.Tensor],
```
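These predicates only read flags off `server_args`, so their interaction can be checked with a stand-in object. A sketch with hypothetical flag values; the real `ServerArgs` carries many more fields:

```python
from types import SimpleNamespace

from sglang.srt.utils import require_gathered_buffer, require_mlp_sync, require_mlp_tp_gather

# Hypothetical setup: DP attention with 2 DP groups inside a TP-4 group and
# no separate dense-MLP TP size configured.
args = SimpleNamespace(
    enable_dp_attention=True,
    dp_size=2,
    tp_size=4,
    moe_dense_tp_size=None,
    enable_dp_lm_head=False,
    enable_deepep_moe=False,
)

print(require_mlp_tp_gather(args))    # True: MLP input arrives via all-gather
print(require_gathered_buffer(args))  # True: a gathered buffer is required
print(require_mlp_sync(args))         # True: DP attention forces MLP sync
```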
```diff
@@ -2329,6 +2457,77 @@ def cpu_has_amx_support():
     return torch._C._cpu._is_amx_tile_supported() and is_intel_amx_backend_available


+def prepack_weight_if_needed(weight):
+    if weight.device != torch.device("cpu"):
+        return weight
+    if not cpu_has_amx_support():
+        return weight
+
+    return torch.ops.sgl_kernel.convert_weight_packed(weight)
+
+
+# TODO: currently gemm kernel has the below requirements:
+# OC % TILE_N == 0, where TILE_N = 16
+# IC % TILE_K == 0, where TILE_K = 32
+def dim_is_supported(weight):
+    return weight.size(0) % 16 == 0 and weight.size(1) % 32 == 0
+
+
+def _process_weight_after_loading(module, weight_names, transpose_dims=None) -> None:
+    # Pack weight for get better performance on CPU
+    devices = {getattr(module, weight_name).device for weight_name in weight_names}
+    assert len(devices) == 1, f"Expects all weights to be on the same device"
+    device = devices.pop()
+
+    if transpose_dims:
+        assert len(weight_names) == len(
+            transpose_dims
+        ), "len(weight_names) should be equal to len(transpose_dims)"
+
+    for i, weight_name in enumerate(weight_names):
+        weight_tensor = getattr(module, weight_name)
+
+        # We don't pack weight or use intel amx backend if any weight of this module has unsupported dim.
+        if not dim_is_supported(weight_tensor):
+            logger.warning(
+                f"Expects weight.size(0) % 16 == 0 and weight.size(1) % 32 == 0 "
+                f"but {weight_tensor.size(0)=} and {weight_tensor.size(1)=} in {module}. "
+                f"{module} won't use intel amx backend."
+            )
+            module.use_intel_amx_backend = False
+            return
+
+        if transpose_dims and transpose_dims[i]:
+            weight_tensor = weight_tensor.transpose(*transpose_dims[i])
+
+        packed_weight = torch.nn.Parameter(
+            prepack_weight_if_needed(weight_tensor),
+            requires_grad=False,
+        )
+        packed_weight.__dict__ = weight_tensor.__dict__
+        setattr(module, weight_name, packed_weight)
+
+    module.use_intel_amx_backend = (
+        device == torch.device("cpu") and cpu_has_amx_support()
+    )
+
+    if (
+        module.use_intel_amx_backend
+        and hasattr(module, "bias")
+        and module.bias is not None
+    ):
+        module.bias = torch.nn.Parameter(module.bias.data.float(), requires_grad=False)
+
+
+class PackWeightMethod:
+    def __init__(self, weight_names, transpose_dims=None):
+        self.weight_names = weight_names
+        self.transpose_dims = transpose_dims
+
+    def process_weights_after_loading(self, module) -> None:
+        _process_weight_after_loading(module, self.weight_names, self.transpose_dims)
+
+
 class LazyValue:
     def __init__(self, creator: Callable):
         self._creator = creator
```
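The packing path keys off module attributes rather than a quantization config, so it can be attached to any module whose weights sit on CPU. A sketch with a plain `nn.Linear`; actual packing only happens when sgl-kernel is installed and the CPU reports Intel AMX support, otherwise the weights pass through unchanged:

```python
import torch
from sglang.srt.utils import PackWeightMethod, cpu_has_amx_support

# Hypothetical module; dims chosen so OC % 16 == 0 and IC % 32 == 0.
linear = torch.nn.Linear(in_features=64, out_features=32, bias=True)

pack_method = PackWeightMethod(weight_names=["weight"])
pack_method.process_weights_after_loading(linear)

print(cpu_has_amx_support(), linear.use_intel_amx_backend)
```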
```diff
@@ -2340,3 +2539,41 @@ class LazyValue:
         self._value = self._creator()
         self._creator = None
         return self._value
+
+
+def dynamic_import(func_path: str):
+    parts = func_path.split(".")
+    if len(parts) < 2:
+        raise ValueError(
+            "func_path should contain both module name and func name (such as 'module.func')"
+        )
+    module_path = ".".join(parts[:-1])
+    func_name = parts[-1]
+    module = importlib.import_module(module_path)
+    func = getattr(module, func_name)
+    return func
+
+
+def configure_gc_logger():
+    logger.info("Enable GC Logger")
+
+    import gc
+
+    gc_start_time = {}
+
+    def gc_callback(phase, info):
+        gen = info.get("generation", "?")
+        if phase == "start":
+            gc_start_time[gen] = time.time()
+            logger.info(f"GC start: Time {time.time()} | Generation {gen}")
+        elif phase == "stop":
+            duration = time.time() - gc_start_time.get(gen, time.time())
+            collected = info.get("collected", "?")
+            uncollectable = info.get("uncollectable", "?")
+            logger.info(
+                f"GC end: Time {time.time()} | Generation {gen} | "
+                f"Duration: {duration:.4f}s | Collected: {collected} | Uncollectable: {uncollectable} "
+                f'{"(LONG GC)" if duration > 0.1 else ""}'
+            )
+
+    gc.callbacks.append(gc_callback)
```
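Both new helpers are self-contained. A short usage sketch; `os.path.join` is just an arbitrary importable target for `dynamic_import`:

```python
import gc
from sglang.srt.utils import configure_gc_logger, dynamic_import

# dynamic_import resolves a dotted "module.attr" path at runtime.
join = dynamic_import("os.path.join")
print(join("a", "b"))

# configure_gc_logger registers a gc callback that logs each collection's
# generation, duration, and collected/uncollectable counts.
configure_gc_logger()
gc.collect()
```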
sglang/test/test_block_fp8_ep.py
CHANGED
sglang/test/test_utils.py
CHANGED
```diff
@@ -37,6 +37,7 @@ from sglang.utils import get_exception_traceback
 # General test models
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
```
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.4.
+__version__ = "0.4.8.post1"
```
{sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.
+Version: 0.4.8.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
```
```diff
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines==0.1.11; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
```
```diff
@@ -249,13 +250,12 @@ Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: einops; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
```
```diff
@@ -263,27 +263,21 @@ Requires-Dist: torch==2.7.1; extra == "blackwell"
 Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
-Requires-Dist: outlines==0.1.11; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
```
```diff
@@ -292,7 +286,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver>=0.0.
+Requires-Dist: torch_memory_saver>=0.0.8; extra == "torch-memory-saver"
 Provides-Extra: decord
 Requires-Dist: decord; extra == "decord"
 Provides-Extra: test
```
|