sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -6
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +24 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +27 -2
- sglang/srt/entrypoints/http_server.py +12 -0
- sglang/srt/entrypoints/openai/protocol.py +2 -2
- sglang/srt/entrypoints/openai/serving_chat.py +22 -6
- sglang/srt/entrypoints/openai/serving_completions.py +9 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +11 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
- sglang/srt/layers/attention/triton_backend.py +85 -46
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
- sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +51 -3
- sglang/srt/layers/dp_attention.py +23 -4
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +5 -1
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/quantization/__init__.py +13 -14
- sglang/srt/layers/quantization/awq.py +7 -7
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +5 -4
- sglang/srt/layers/quantization/marlin_utils.py +11 -3
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +165 -68
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +206 -37
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +25 -0
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +76 -18
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +9 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +4 -9
- sglang/srt/managers/scheduler.py +25 -16
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +60 -21
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +7 -5
- sglang/srt/mem_cache/allocator_ascend.py +0 -11
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +25 -12
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +43 -32
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +3 -1
- sglang/srt/models/deepseek_v2.py +224 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +25 -63
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +34 -74
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama4.py +0 -2
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +3 -18
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +7 -1
- sglang/srt/models/qwen3_moe.py +9 -38
- sglang/srt/models/step3_vl.py +2 -1
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +6 -1
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/server_args.py +237 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +16 -11
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
- sglang/srt/layers/quantization/fp4.py +0 -557
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8_ep.py
CHANGED
@@ -12,7 +12,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
|
|
12
12
|
run_moe_ep_preproess,
|
13
13
|
silu_and_mul_triton_kernel,
|
14
14
|
)
|
15
|
-
from sglang.srt.layers.moe.topk import select_experts
|
15
|
+
from sglang.srt.layers.moe.topk import TopKConfig, select_experts
|
16
16
|
from sglang.test.test_utils import CustomTestCase
|
17
17
|
|
18
18
|
|
@@ -22,35 +22,26 @@ def ep_moe(
|
|
22
22
|
w1: torch.Tensor,
|
23
23
|
w2: torch.Tensor,
|
24
24
|
router_logits: torch.Tensor,
|
25
|
-
|
26
|
-
renormalize: bool,
|
25
|
+
topk_config: TopKConfig,
|
27
26
|
# ep config
|
28
27
|
num_experts: int = 256,
|
29
28
|
fp8_dtype: torch.types = torch.float8_e4m3fn,
|
30
29
|
num_experts_per_partition: int = 128,
|
31
30
|
start_expert_id: int = 0,
|
32
31
|
end_expert_id: int = 127,
|
33
|
-
use_grouped_topk: bool = False,
|
34
|
-
num_expert_group: Optional[int] = None,
|
35
|
-
topk_group: Optional[int] = None,
|
36
|
-
custom_routing_function: Optional[Callable] = None,
|
37
32
|
use_fp8_w8a8: bool = False,
|
38
33
|
w1_scale_inv: Optional[torch.Tensor] = None,
|
39
34
|
w2_scale_inv: Optional[torch.Tensor] = None,
|
40
35
|
block_shape: Optional[List[int]] = None,
|
41
36
|
):
|
42
37
|
use_blockwise_fp8 = block_shape is not None
|
43
|
-
|
38
|
+
top_k = topk_config.top_k
|
39
|
+
topk_output = select_experts(
|
44
40
|
hidden_states=hidden_states,
|
45
41
|
router_logits=router_logits,
|
46
|
-
|
47
|
-
use_grouped_topk=use_grouped_topk,
|
48
|
-
renormalize=renormalize,
|
49
|
-
topk_group=topk_group,
|
50
|
-
num_expert_group=num_expert_group,
|
51
|
-
# correction_bias=correction_bias, #skip this in test
|
52
|
-
custom_routing_function=custom_routing_function,
|
42
|
+
topk_config=topk_config,
|
53
43
|
)
|
44
|
+
topk_weights, topk_ids, _ = topk_output
|
54
45
|
|
55
46
|
reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(topk_ids, num_experts)
|
56
47
|
|
@@ -294,14 +285,18 @@ class TestW8A8BlockFP8EPMoE(CustomTestCase):
|
|
294
285
|
start_id = cur_rank * num_experts_per_partition
|
295
286
|
end_id = start_id + num_experts_per_partition - 1
|
296
287
|
|
288
|
+
topk_config = TopKConfig(
|
289
|
+
top_k=topk,
|
290
|
+
renormalize=False,
|
291
|
+
)
|
292
|
+
|
297
293
|
with torch.inference_mode():
|
298
294
|
out = ep_moe(
|
299
295
|
hidden_states=a,
|
300
296
|
w1=w1,
|
301
297
|
w2=w2,
|
302
298
|
router_logits=score,
|
303
|
-
|
304
|
-
renormalize=False,
|
299
|
+
topk_config=topk_config,
|
305
300
|
use_fp8_w8a8=True,
|
306
301
|
w1_scale_inv=w1_s,
|
307
302
|
w2_scale_inv=w2_s,
|
@@ -316,8 +311,7 @@ class TestW8A8BlockFP8EPMoE(CustomTestCase):
|
|
316
311
|
w1=w1_ref,
|
317
312
|
w2=w2_ref,
|
318
313
|
router_logits=score,
|
319
|
-
|
320
|
-
renormalize=False,
|
314
|
+
topk_config=topk_config,
|
321
315
|
use_fp8_w8a8=False,
|
322
316
|
w1_scale_inv=None,
|
323
317
|
w2_scale_inv=None,
|
sglang/test/test_cutlass_moe.py
CHANGED
@@ -153,9 +153,8 @@ def run_test(tp_size, batch_size, model_config, check=False):
|
|
153
153
|
x,
|
154
154
|
w1,
|
155
155
|
w2,
|
156
|
-
topk_weights,
|
157
|
-
|
158
|
-
inplace=False, # Use False for benchmarking to avoid side effects if run multiple times
|
156
|
+
(topk_weights, topk_ids, "dummy"),
|
157
|
+
inplace=False,
|
159
158
|
activation="silu", # Assuming SiLU activation common in MoEs
|
160
159
|
use_fp8_w8a8=True,
|
161
160
|
w1_scale=w1_scale,
|
@@ -221,8 +220,7 @@ def run_test(tp_size, batch_size, model_config, check=False):
|
|
221
220
|
x,
|
222
221
|
w1, # Original shape
|
223
222
|
w2, # Original shape
|
224
|
-
topk_weights,
|
225
|
-
topk_ids,
|
223
|
+
(topk_weights, topk_ids, "dummy"),
|
226
224
|
inplace=False, # Important: Use False to get output tensor
|
227
225
|
activation="silu",
|
228
226
|
use_fp8_w8a8=True,
|
@@ -266,7 +264,7 @@ if __name__ == "__main__":
|
|
266
264
|
"--batch-sizes",
|
267
265
|
type=int,
|
268
266
|
nargs="+",
|
269
|
-
default=[1, 4, 8, 16, 32, 64, 128, 256, 512], # Adjusted default
|
267
|
+
default=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024], # Adjusted default
|
270
268
|
help="List of batch sizes to test",
|
271
269
|
)
|
272
270
|
parser.add_argument("--check", action="store_true", help="Enable check mode")
|
@@ -6,7 +6,7 @@ import pytest
|
|
6
6
|
import torch
|
7
7
|
|
8
8
|
from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
|
9
|
-
from sglang.srt.layers.moe.topk import select_experts
|
9
|
+
from sglang.srt.layers.moe.topk import TopKConfig, select_experts
|
10
10
|
|
11
11
|
|
12
12
|
def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
|
@@ -100,11 +100,12 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
|
|
100
100
|
s_strides2 = c_strides2
|
101
101
|
|
102
102
|
score = torch.randn((M, E), dtype=dtype, device=device)
|
103
|
-
|
103
|
+
topk_output = select_experts(
|
104
104
|
hidden_states=a,
|
105
105
|
router_logits=score,
|
106
|
-
top_k=topk,
|
106
|
+
topk_config=TopKConfig(top_k=topk, renormalize=False),
|
107
107
|
)
|
108
|
+
topk_weights, topk_ids, _ = topk_output
|
108
109
|
expert_map = torch.arange(E, dtype=torch.int32, device=device)
|
109
110
|
expert_map[local_e:] = E
|
110
111
|
|
sglang/test/test_fp4_moe.py
CHANGED
@@ -9,7 +9,7 @@ from sgl_kernel import scaled_fp4_quant
|
|
9
9
|
from sglang.srt.layers.activation import SiluAndMul
|
10
10
|
from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
|
11
11
|
from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
|
12
|
-
from sglang.srt.layers.moe.topk import select_experts
|
12
|
+
from sglang.srt.layers.moe.topk import TopKConfig, select_experts
|
13
13
|
|
14
14
|
if torch.cuda.get_device_capability() < (10, 0):
|
15
15
|
pytest.skip(
|
@@ -163,11 +163,12 @@ def check_moe(
|
|
163
163
|
|
164
164
|
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
165
165
|
|
166
|
-
|
166
|
+
topk_output = select_experts(
|
167
167
|
hidden_states=a,
|
168
168
|
router_logits=score,
|
169
|
-
top_k=topk,
|
169
|
+
topk_config=TopKConfig(top_k=topk, renormalize=False),
|
170
170
|
)
|
171
|
+
topk_weights, topk_ids, _ = topk_output
|
171
172
|
|
172
173
|
a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
|
173
174
|
a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
|
sglang/test/test_utils.py
CHANGED
@@ -61,6 +61,12 @@ DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
|
|
61
61
|
DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
|
62
62
|
"nvidia/Llama-3.1-8B-Instruct-FP8"
|
63
63
|
)
|
64
|
+
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
|
65
|
+
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
|
66
|
+
|
67
|
+
# W8A8 models
|
68
|
+
DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
|
69
|
+
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
64
70
|
|
65
71
|
# EAGLE
|
66
72
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
@@ -78,6 +84,7 @@ DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
|
|
78
84
|
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
|
79
85
|
)
|
80
86
|
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
|
87
|
+
DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST = "Barrrrry/DeepSeek-R1-W4AFP8"
|
81
88
|
|
82
89
|
# Nightly tests
|
83
90
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
|
sglang/utils.py
CHANGED
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.0rc2"
|
1
|
+
__version__ = "0.5.1"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.5.0rc2
|
3
|
+
Version: 0.5.1
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -232,7 +232,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
|
|
232
232
|
Requires-Dist: msgspec; extra == "runtime-common"
|
233
233
|
Requires-Dist: ninja; extra == "runtime-common"
|
234
234
|
Requires-Dist: openai==1.99.1; extra == "runtime-common"
|
235
|
-
Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
|
235
|
+
Requires-Dist: openai-harmony==0.0.4; extra == "runtime-common"
|
236
236
|
Requires-Dist: orjson; extra == "runtime-common"
|
237
237
|
Requires-Dist: outlines==0.1.11; extra == "runtime-common"
|
238
238
|
Requires-Dist: packaging; extra == "runtime-common"
|
@@ -240,9 +240,9 @@ Requires-Dist: partial_json_parser; extra == "runtime-common"
|
|
240
240
|
Requires-Dist: pillow; extra == "runtime-common"
|
241
241
|
Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
|
242
242
|
Requires-Dist: psutil; extra == "runtime-common"
|
243
|
+
Requires-Dist: pybase64; extra == "runtime-common"
|
243
244
|
Requires-Dist: pydantic; extra == "runtime-common"
|
244
245
|
Requires-Dist: pynvml; extra == "runtime-common"
|
245
|
-
Requires-Dist: pybase64; extra == "runtime-common"
|
246
246
|
Requires-Dist: python-multipart; extra == "runtime-common"
|
247
247
|
Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
|
248
248
|
Requires-Dist: sentencepiece; extra == "runtime-common"
|
@@ -254,7 +254,7 @@ Requires-Dist: torchao==0.9.0; extra == "runtime-common"
|
|
254
254
|
Requires-Dist: transformers==4.55.2; extra == "runtime-common"
|
255
255
|
Requires-Dist: uvicorn; extra == "runtime-common"
|
256
256
|
Requires-Dist: uvloop; extra == "runtime-common"
|
257
|
-
Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
|
257
|
+
Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
|
258
258
|
Provides-Extra: srt
|
259
259
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
260
260
|
Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
|
@@ -278,13 +278,12 @@ Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
|
|
278
278
|
Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
|
279
279
|
Provides-Extra: srt-cpu
|
280
280
|
Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
|
281
|
-
|
281
|
+
Provides-Extra: srt-npu
|
282
|
+
Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
|
282
283
|
Provides-Extra: srt-xpu
|
283
284
|
Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
|
284
285
|
Provides-Extra: srt-hpu
|
285
286
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
|
286
|
-
Provides-Extra: srt-npu
|
287
|
-
Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
|
288
287
|
Provides-Extra: openai
|
289
288
|
Requires-Dist: openai==1.99.1; extra == "openai"
|
290
289
|
Requires-Dist: tiktoken; extra == "openai"
|
@@ -375,6 +374,7 @@ Dynamic: license-file
|
|
375
374
|
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
376
375
|
|
377
376
|
## News
|
377
|
+
- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [Register here](https://lu.ma/gbfhjvuo).
|
378
378
|
- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
|
379
379
|
- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
|
380
380
|
- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
|