sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. The information is provided for informational purposes only.
Files changed (123)
  1. sglang/bench_one_batch.py +8 -6
  2. sglang/srt/_custom_ops.py +2 -2
  3. sglang/srt/code_completion_parser.py +2 -44
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/constants.py +3 -0
  6. sglang/srt/conversation.py +14 -3
  7. sglang/srt/custom_op.py +11 -1
  8. sglang/srt/disaggregation/base/conn.py +2 -0
  9. sglang/srt/disaggregation/decode.py +22 -28
  10. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  11. sglang/srt/disaggregation/mini_lb.py +34 -4
  12. sglang/srt/disaggregation/mooncake/conn.py +301 -64
  13. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  14. sglang/srt/disaggregation/nixl/conn.py +94 -46
  15. sglang/srt/disaggregation/prefill.py +20 -15
  16. sglang/srt/disaggregation/utils.py +47 -18
  17. sglang/srt/distributed/parallel_state.py +12 -4
  18. sglang/srt/entrypoints/engine.py +27 -31
  19. sglang/srt/entrypoints/http_server.py +149 -79
  20. sglang/srt/entrypoints/http_server_engine.py +0 -3
  21. sglang/srt/entrypoints/openai/__init__.py +0 -0
  22. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34
  23. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  24. sglang/srt/entrypoints/openai/serving_chat.py +897 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +425 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
  27. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  28. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  29. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  30. sglang/srt/entrypoints/openai/utils.py +72 -0
  31. sglang/srt/function_call/base_format_detector.py +7 -4
  32. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  33. sglang/srt/function_call/ebnf_composer.py +64 -10
  34. sglang/srt/function_call/function_call_parser.py +6 -6
  35. sglang/srt/function_call/llama32_detector.py +1 -1
  36. sglang/srt/function_call/mistral_detector.py +1 -1
  37. sglang/srt/function_call/pythonic_detector.py +1 -1
  38. sglang/srt/function_call/qwen25_detector.py +1 -1
  39. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  40. sglang/srt/layers/activation.py +28 -3
  41. sglang/srt/layers/attention/aiter_backend.py +5 -2
  42. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  43. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
  44. sglang/srt/layers/attention/flashattention_backend.py +43 -23
  45. sglang/srt/layers/attention/flashinfer_backend.py +9 -6
  46. sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
  47. sglang/srt/layers/attention/flashmla_backend.py +5 -2
  48. sglang/srt/layers/attention/tbo_backend.py +3 -3
  49. sglang/srt/layers/attention/triton_backend.py +19 -11
  50. sglang/srt/layers/communicator.py +5 -5
  51. sglang/srt/layers/dp_attention.py +11 -2
  52. sglang/srt/layers/layernorm.py +44 -2
  53. sglang/srt/layers/linear.py +18 -1
  54. sglang/srt/layers/logits_processor.py +14 -5
  55. sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
  56. sglang/srt/layers/moe/ep_moe/layer.py +286 -13
  57. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
  58. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
  62. sglang/srt/layers/moe/topk.py +117 -4
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  64. sglang/srt/layers/quantization/fp8.py +25 -17
  65. sglang/srt/layers/quantization/fp8_utils.py +5 -4
  66. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  67. sglang/srt/layers/quantization/utils.py +5 -2
  68. sglang/srt/layers/rotary_embedding.py +144 -12
  69. sglang/srt/layers/sampler.py +1 -1
  70. sglang/srt/layers/vocab_parallel_embedding.py +14 -1
  71. sglang/srt/lora/lora_manager.py +173 -74
  72. sglang/srt/lora/mem_pool.py +49 -45
  73. sglang/srt/lora/utils.py +1 -1
  74. sglang/srt/managers/cache_controller.py +33 -15
  75. sglang/srt/managers/expert_distribution.py +21 -0
  76. sglang/srt/managers/io_struct.py +19 -14
  77. sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
  78. sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
  79. sglang/srt/managers/schedule_batch.py +49 -32
  80. sglang/srt/managers/schedule_policy.py +70 -56
  81. sglang/srt/managers/scheduler.py +189 -68
  82. sglang/srt/managers/template_manager.py +226 -0
  83. sglang/srt/managers/tokenizer_manager.py +11 -8
  84. sglang/srt/managers/tp_worker.py +12 -2
  85. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  86. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  87. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  88. sglang/srt/mem_cache/chunk_cache.py +11 -16
  89. sglang/srt/mem_cache/hiradix_cache.py +34 -23
  90. sglang/srt/mem_cache/memory_pool.py +118 -114
  91. sglang/srt/mem_cache/radix_cache.py +20 -16
  92. sglang/srt/model_executor/cuda_graph_runner.py +77 -46
  93. sglang/srt/model_executor/forward_batch_info.py +18 -5
  94. sglang/srt/model_executor/model_runner.py +27 -8
  95. sglang/srt/model_loader/loader.py +50 -8
  96. sglang/srt/model_loader/weight_utils.py +100 -2
  97. sglang/srt/models/deepseek_nextn.py +35 -30
  98. sglang/srt/models/deepseek_v2.py +255 -30
  99. sglang/srt/models/gemma3n_audio.py +949 -0
  100. sglang/srt/models/gemma3n_causal.py +1009 -0
  101. sglang/srt/models/gemma3n_mm.py +511 -0
  102. sglang/srt/models/glm4.py +312 -0
  103. sglang/srt/models/hunyuan.py +771 -0
  104. sglang/srt/models/mimo_mtp.py +2 -18
  105. sglang/srt/reasoning_parser.py +21 -11
  106. sglang/srt/server_args.py +51 -9
  107. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
  108. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
  109. sglang/srt/speculative/eagle_utils.py +80 -8
  110. sglang/srt/speculative/eagle_worker.py +124 -41
  111. sglang/srt/torch_memory_saver_adapter.py +19 -15
  112. sglang/srt/two_batch_overlap.py +4 -1
  113. sglang/srt/utils.py +248 -11
  114. sglang/test/test_block_fp8_ep.py +1 -0
  115. sglang/test/test_utils.py +1 -0
  116. sglang/version.py +1 -1
  117. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
  118. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
  119. sglang/srt/entrypoints/verl_engine.py +0 -179
  120. sglang/srt/openai_api/adapter.py +0 -2148
  121. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
  123. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -160,7 +160,7 @@ def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()
 
 
-def is_cpu() -> bool:
+def is_host_cpu_x86() -> bool:
     machine = platform.machine().lower()
     return (
         machine in ("x86_64", "amd64", "i386", "i686")
@@ -169,6 +169,10 @@ def is_cpu() -> bool:
     )
 
 
+def is_cpu() -> bool:
+    return os.getenv("SGLANG_USE_CPU_ENGINE", "0") == "1" and is_host_cpu_x86()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
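The rename splits host-architecture detection from engine selection: is_cpu() now returns True only when the SGLANG_USE_CPU_ENGINE environment variable opts in and the host is x86. A minimal sketch of the new gating, assuming both helpers are imported from sglang.srt.utils:

    import os

    os.environ["SGLANG_USE_CPU_ENGINE"] = "1"  # opt in to the CPU engine

    from sglang.srt.utils import is_cpu, is_host_cpu_x86

    print(is_host_cpu_x86())  # architecture check only
    print(is_cpu())           # env-var gate AND architecture check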
@@ -1291,6 +1295,15 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_npu_memory_capacity():
+    try:
+        import torch_npu
+
+        return torch.npu.mem_get_info()[1] // 1024 // 1024  # unit: MB
+    except ImportError as e:
+        raise ImportError("torch_npu is required when run on npu device.")
+
+
 def get_device_memory_capacity(device: str = None):
     if is_cuda():
         gpu_mem = get_nvgpu_memory_capacity()
@@ -1298,6 +1311,8 @@ def get_device_memory_capacity(device: str = None):
         gpu_mem = get_amdgpu_memory_capacity()
     elif device == "hpu":
         gpu_mem = get_hpu_memory_capacity()
+    elif device == "npu":
+        gpu_mem = get_npu_memory_capacity()
     else:
         # GPU memory is not known yet or no GPU is available.
         gpu_mem = None
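With the new branch, get_device_memory_capacity("npu") reports total NPU memory in MB via torch.npu.mem_get_info() and requires torch_npu. A brief usage sketch (it only succeeds on an NPU host with torch_npu installed):

    from sglang.srt.utils import get_device_memory_capacity

    mem_mb = get_device_memory_capacity("npu")  # total NPU memory in MB
    print(mem_mb)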
@@ -1423,6 +1438,11 @@ def get_device(device_id: Optional[int] = None) -> str:
             return "xpu"
         return "xpu:{}".format(device_id)
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        if device_id == None:
+            return "npu"
+        return "npu:{}".format(device_id)
+
     if is_habana_available():
         try:
             import habana_frameworks.torch.hpu
@@ -1436,6 +1456,15 @@ def get_device(device_id: Optional[int] = None) -> str:
             "Habana frameworks detected, but failed to import 'habana_frameworks.torch.hpu'."
         )
 
+    if is_cpu():
+        if cpu_has_amx_support():
+            logger.info("Intel AMX is detected, using CPU with Intel AMX support.")
+        else:
+            logger.warning(
+                "CPU device enabled, using torch native backend, low performance expected."
+            )
+        return "cpu"
+
     raise RuntimeError("No accelerator (CUDA, XPU, HPU) is available.")
 
 
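get_device() now resolves accelerators in the order CUDA, XPU, NPU, HPU, and finally CPU (only when is_cpu() opts in); otherwise it raises. A small usage sketch:

    from sglang.srt.utils import get_device

    device = get_device()    # e.g. "cuda", "xpu", "npu", or "cpu"
    device0 = get_device(0)  # e.g. "cuda:0" when a device index is given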
@@ -1497,15 +1526,35 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     return major, minor
 
 
+def get_npu_compiler_config():
+    config = {
+        "frozen_parameter": True,
+        "tiling_schedule_optimize": True,
+        "topology_sorting_strategy": "StableRDFS",
+    }
+    return config
+
+
 def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"
 
     if hasattr(torch, "npu") and torch.npu.is_available():
-        import torchair
+        try:
+            import torchair
+            import torchair.ge_concrete_graph.ge_converter.experimental.patch_for_hcom_allreduce
+            from torchair.configs.compiler_config import CompilerConfig
+        except ImportError as e:
+            raise ImportError(
+                "NPU detected, but torchair package is not installed. "
+                "Please install torchair for torch.compile support on NPU."
+            )
+        compiler_config = CompilerConfig()
+        predefined_config = get_npu_compiler_config()
+        for k, v in predefined_config.items():
+            setattr(compiler_config.experimental_config, k, v)
 
-        config = torchair.CompilerConfig()
-        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        npu_backend = torchair.get_npu_backend(compiler_config=compiler_config)
         return npu_backend
 
     return "inductor"
@@ -1868,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
 
@@ -1882,6 +1924,9 @@ def rank0_print(msg: str):
         print(msg, flush=True)
 
 
+rank0_log = rank0_print
+
+
 def get_cuda_version():
     if torch.version.cuda:
         return tuple(map(int, torch.version.cuda.split(".")))
@@ -2105,6 +2150,44 @@ def get_free_port():
         return s.getsockname()[1]
 
 
+def get_local_ip_auto() -> str:
+    interface = os.environ.get("SGLANG_LOCAL_IP_NIC", None)
+    return (
+        get_local_ip_by_nic(interface)
+        if interface is not None
+        else get_local_ip_by_remote()
+    )
+
+
+def get_local_ip_by_nic(interface: str) -> str:
+    try:
+        import netifaces
+    except ImportError as e:
+        raise ImportError(
+            "Environment variable SGLANG_LOCAL_IP_NIC requires package netifaces, please install it through 'pip install netifaces'"
+        ) from e
+
+    try:
+        addresses = netifaces.ifaddresses(interface)
+        if netifaces.AF_INET in addresses:
+            for addr_info in addresses[netifaces.AF_INET]:
+                ip = addr_info.get("addr")
+                if ip and ip != "127.0.0.1" and ip != "0.0.0.0":
+                    return ip
+        if netifaces.AF_INET6 in addresses:
+            for addr_info in addresses[netifaces.AF_INET6]:
+                ip = addr_info.get("addr")
+                if ip and not ip.startswith("fe80::") and ip != "::1":
+                    return ip.split("%")[0]
+    except (ValueError, OSError) as e:
+        raise ValueError(
+            "Can not get local ip from NIC. Please verify whether SGLANG_LOCAL_IP_NIC is set correctly."
+        )
+
+    # Fallback
+    return get_local_ip_by_remote()
+
+
 def get_local_ip_by_remote() -> str:
     # try ipv4
     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
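get_local_ip_auto() prefers an operator-pinned NIC and otherwise falls back to the remote-socket probe. A sketch of the NIC path; the interface name "eth0" is only an example value:

    import os

    os.environ["SGLANG_LOCAL_IP_NIC"] = "eth0"  # example interface name

    from sglang.srt.utils import get_local_ip_auto

    # Resolves the first non-loopback IPv4/IPv6 address on the NIC via netifaces,
    # or falls back to get_local_ip_by_remote() when the variable is unset.
    print(get_local_ip_auto())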
@@ -2216,6 +2299,51 @@ class Withable(Generic[T]):
         self._value = None
 
 
+def require_mlp_tp_gather(server_args):
+    """
+    Check if the input of MLP is obtained by all-gather rather than all-reduce. This only happens when each MLP TP group contains multiple attention DP groups.
+    """
+    if server_args.enable_dp_attention:
+        assert server_args.dp_size > 1, "dp_size must be greater than 1"
+        if (
+            server_args.moe_dense_tp_size is None
+        ):  # TODO(ch-wan): some MoE models do not have dense layers
+            return True
+        elif not server_args.enable_dp_lm_head:
+            return True
+        elif not server_args.enable_deepep_moe:
+            return True
+        else:
+            return (
+                server_args.moe_dense_tp_size
+                > server_args.tp_size // server_args.dp_size
+            )
+    else:
+        return False
+
+
+def require_attn_tp_gather(server_args):
+    """
+    Check if the input of attention is scattered.
+    """
+    assert server_args.moe_dense_tp_size in [1, None]
+    if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
+        if server_args.enable_dp_attention:
+            return server_args.dp_size < server_args.tp_size
+        else:
+            return True
+    else:
+        return False
+
+
+def require_gathered_buffer(server_args):
+    return require_mlp_tp_gather(server_args) or require_attn_tp_gather(server_args)
+
+
+def require_mlp_sync(server_args):
+    return server_args.enable_dp_attention or require_gathered_buffer(server_args)
+
+
 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],
     rhs: Optional[torch.Tensor],
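These helpers only read a handful of ServerArgs fields, so their behavior can be illustrated with a stand-in object; the values below are illustrative, not defaults:

    from types import SimpleNamespace

    from sglang.srt.utils import require_mlp_sync, require_mlp_tp_gather

    # Stand-in for ServerArgs exposing just the fields the helpers read.
    args = SimpleNamespace(
        enable_dp_attention=True,
        dp_size=2,
        tp_size=8,
        moe_dense_tp_size=None,
        enable_dp_lm_head=False,
        enable_deepep_moe=False,
    )
    print(require_mlp_tp_gather(args))  # True: dense MLP TP size is unset
    print(require_mlp_sync(args))       # True: DP attention is enabled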
@@ -2329,6 +2457,77 @@ def cpu_has_amx_support():
     return torch._C._cpu._is_amx_tile_supported() and is_intel_amx_backend_available
 
 
+def prepack_weight_if_needed(weight):
+    if weight.device != torch.device("cpu"):
+        return weight
+    if not cpu_has_amx_support():
+        return weight
+
+    return torch.ops.sgl_kernel.convert_weight_packed(weight)
+
+
+# TODO: currently gemm kernel has the below requirements:
+# OC % TILE_N == 0, where TILE_N = 16
+# IC % TILE_K == 0, where TILE_K = 32
+def dim_is_supported(weight):
+    return weight.size(0) % 16 == 0 and weight.size(1) % 32 == 0
+
+
+def _process_weight_after_loading(module, weight_names, transpose_dims=None) -> None:
+    # Pack weight for get better performance on CPU
+    devices = {getattr(module, weight_name).device for weight_name in weight_names}
+    assert len(devices) == 1, f"Expects all weights to be on the same device"
+    device = devices.pop()
+
+    if transpose_dims:
+        assert len(weight_names) == len(
+            transpose_dims
+        ), "len(weight_names) should be equal to len(transpose_dims)"
+
+    for i, weight_name in enumerate(weight_names):
+        weight_tensor = getattr(module, weight_name)
+
+        # We don't pack weight or use intel amx backend if any weight of this module has unsupported dim.
+        if not dim_is_supported(weight_tensor):
+            logger.warning(
+                f"Expects weight.size(0) % 16 == 0 and weight.size(1) % 32 == 0 "
+                f"but {weight_tensor.size(0)=} and {weight_tensor.size(1)=} in {module}. "
+                f"{module} won't use intel amx backend."
+            )
+            module.use_intel_amx_backend = False
+            return
+
+        if transpose_dims and transpose_dims[i]:
+            weight_tensor = weight_tensor.transpose(*transpose_dims[i])
+
+        packed_weight = torch.nn.Parameter(
+            prepack_weight_if_needed(weight_tensor),
+            requires_grad=False,
+        )
+        packed_weight.__dict__ = weight_tensor.__dict__
+        setattr(module, weight_name, packed_weight)
+
+    module.use_intel_amx_backend = (
+        device == torch.device("cpu") and cpu_has_amx_support()
+    )
+
+    if (
+        module.use_intel_amx_backend
+        and hasattr(module, "bias")
+        and module.bias is not None
+    ):
+        module.bias = torch.nn.Parameter(module.bias.data.float(), requires_grad=False)
+
+
+class PackWeightMethod:
+    def __init__(self, weight_names, transpose_dims=None):
+        self.weight_names = weight_names
+        self.transpose_dims = transpose_dims
+
+    def process_weights_after_loading(self, module) -> None:
+        _process_weight_after_loading(module, self.weight_names, self.transpose_dims)
+
+
 class LazyValue:
     def __init__(self, creator: Callable):
         self._creator = creator
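PackWeightMethod wraps the AMX prepacking above so a layer can opt in after weight loading. A minimal sketch with a toy module; packing only takes effect on a CPU with AMX support and the sgl-kernel packing op available:

    import torch

    from sglang.srt.utils import PackWeightMethod

    class TinyLinear(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # 16 x 32 satisfies the OC % 16 == 0 and IC % 32 == 0 checks in dim_is_supported().
            self.weight = torch.nn.Parameter(torch.randn(16, 32), requires_grad=False)

    module = TinyLinear()
    PackWeightMethod(weight_names=["weight"]).process_weights_after_loading(module)
    print(module.use_intel_amx_backend)  # True only on AMX-capable CPUs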
@@ -2340,3 +2539,41 @@ class LazyValue:
         self._value = self._creator()
         self._creator = None
         return self._value
+
+
+def dynamic_import(func_path: str):
+    parts = func_path.split(".")
+    if len(parts) < 2:
+        raise ValueError(
+            "func_path should contain both module name and func name (such as 'module.func')"
+        )
+    module_path = ".".join(parts[:-1])
+    func_name = parts[-1]
+    module = importlib.import_module(module_path)
+    func = getattr(module, func_name)
+    return func
+
+
+def configure_gc_logger():
+    logger.info("Enable GC Logger")
+
+    import gc
+
+    gc_start_time = {}
+
+    def gc_callback(phase, info):
+        gen = info.get("generation", "?")
+        if phase == "start":
+            gc_start_time[gen] = time.time()
+            logger.info(f"GC start: Time {time.time()} | Generation {gen}")
+        elif phase == "stop":
+            duration = time.time() - gc_start_time.get(gen, time.time())
+            collected = info.get("collected", "?")
+            uncollectable = info.get("uncollectable", "?")
+            logger.info(
+                f"GC end: Time {time.time()} | Generation {gen} | "
+                f"Duration: {duration:.4f}s | Collected: {collected} | Uncollectable: {uncollectable} "
+                f'{"(LONG GC)" if duration > 0.1 else ""}'
+            )
+
+    gc.callbacks.append(gc_callback)
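dynamic_import() resolves a dotted "module.func" path at runtime, and configure_gc_logger() attaches a gc callback that logs every collection with its duration. A brief usage sketch:

    import gc

    from sglang.srt.utils import configure_gc_logger, dynamic_import

    json_dumps = dynamic_import("json.dumps")  # imports the module, then fetches the attribute
    print(json_dumps({"ok": True}))

    configure_gc_logger()  # subsequent collections emit "GC start" / "GC end" log lines
    gc.collect()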
@@ -182,6 +182,7 @@ def ep_moe(
         end_expert_id,
         top_k,
         hidden_states.size(1),
+        0,
         BLOCK_SIZE=512,
     )
     return output
sglang/test/test_utils.py CHANGED
@@ -37,6 +37,7 @@ from sglang.utils import get_exception_traceback
 # General test models
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.7.post1"
+__version__ = "0.4.8.post1"
{sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.7.post1
+Version: 0.4.8.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines==0.1.11; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
@@ -249,13 +250,12 @@ Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: einops; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -263,27 +263,21 @@ Requires-Dist: torch==2.7.1; extra == "blackwell"
 Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
-Requires-Dist: outlines==0.1.11; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -292,7 +286,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver>=0.0.8; extra == "torch-memory-saver"
 Provides-Extra: decord
 Requires-Dist: decord; extra == "decord"
 Provides-Extra: test