sglang-0.4.6.post1-py3-none-any.whl → sglang-0.4.6.post3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. sglang/bench_one_batch.py +3 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/check_env.py +3 -3
  4. sglang/lang/chat_template.py +44 -0
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/deepseekvl2.py +3 -0
  7. sglang/srt/configs/device_config.py +1 -1
  8. sglang/srt/configs/internvl.py +696 -0
  9. sglang/srt/configs/janus_pro.py +3 -0
  10. sglang/srt/configs/kimi_vl.py +38 -0
  11. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  12. sglang/srt/configs/model_config.py +32 -0
  13. sglang/srt/constrained/xgrammar_backend.py +11 -19
  14. sglang/srt/conversation.py +151 -3
  15. sglang/srt/disaggregation/decode.py +4 -1
  16. sglang/srt/disaggregation/mini_lb.py +74 -23
  17. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  18. sglang/srt/disaggregation/nixl/conn.py +241 -71
  19. sglang/srt/disaggregation/utils.py +44 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  21. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  22. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  23. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  24. sglang/srt/distributed/parallel_state.py +22 -1
  25. sglang/srt/entrypoints/engine.py +58 -24
  26. sglang/srt/entrypoints/http_server.py +28 -1
  27. sglang/srt/entrypoints/verl_engine.py +3 -2
  28. sglang/srt/function_call_parser.py +97 -0
  29. sglang/srt/hf_transformers_utils.py +22 -1
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  31. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  32. sglang/srt/layers/attention/flashinfer_backend.py +129 -94
  33. sglang/srt/layers/attention/flashinfer_mla_backend.py +88 -30
  34. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  35. sglang/srt/layers/attention/merge_state.py +46 -0
  36. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  37. sglang/srt/layers/attention/vision.py +290 -163
  38. sglang/srt/layers/dp_attention.py +5 -2
  39. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  40. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +98 -57
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -5
  49. sglang/srt/layers/quantization/__init__.py +2 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  52. sglang/srt/layers/quantization/deep_gemm.py +6 -1
  53. sglang/srt/layers/quantization/fp8.py +108 -95
  54. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  55. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  56. sglang/srt/layers/quantization/kv_cache.py +3 -10
  57. sglang/srt/layers/quantization/utils.py +0 -5
  58. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  59. sglang/srt/layers/utils.py +35 -0
  60. sglang/srt/lora/layers.py +35 -9
  61. sglang/srt/lora/lora_manager.py +81 -35
  62. sglang/srt/managers/cache_controller.py +115 -119
  63. sglang/srt/managers/data_parallel_controller.py +52 -34
  64. sglang/srt/managers/io_struct.py +10 -0
  65. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  66. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  67. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  68. sglang/srt/managers/schedule_batch.py +44 -16
  69. sglang/srt/managers/schedule_policy.py +11 -5
  70. sglang/srt/managers/scheduler.py +291 -72
  71. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  72. sglang/srt/managers/tokenizer_manager.py +24 -13
  73. sglang/srt/managers/tp_worker.py +60 -28
  74. sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  75. sglang/srt/mem_cache/chunk_cache.py +2 -0
  76. sglang/srt/mem_cache/memory_pool.py +70 -36
  77. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  78. sglang/srt/model_executor/forward_batch_info.py +31 -1
  79. sglang/srt/model_executor/model_runner.py +159 -90
  80. sglang/srt/model_loader/loader.py +18 -11
  81. sglang/srt/models/clip.py +4 -4
  82. sglang/srt/models/deepseek_janus_pro.py +1 -1
  83. sglang/srt/models/deepseek_nextn.py +2 -277
  84. sglang/srt/models/deepseek_v2.py +132 -37
  85. sglang/srt/models/gemma3_mm.py +1 -1
  86. sglang/srt/models/internlm2.py +3 -0
  87. sglang/srt/models/internvl.py +670 -0
  88. sglang/srt/models/kimi_vl.py +308 -0
  89. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  90. sglang/srt/models/llama.py +93 -31
  91. sglang/srt/models/llama4.py +54 -7
  92. sglang/srt/models/llama_eagle.py +4 -1
  93. sglang/srt/models/llama_eagle3.py +4 -1
  94. sglang/srt/models/minicpmv.py +1 -1
  95. sglang/srt/models/mllama.py +1 -1
  96. sglang/srt/models/phi3_small.py +16 -2
  97. sglang/srt/models/qwen2_5_vl.py +8 -4
  98. sglang/srt/models/qwen2_moe.py +8 -3
  99. sglang/srt/models/qwen2_vl.py +4 -16
  100. sglang/srt/models/qwen3_moe.py +8 -3
  101. sglang/srt/models/xiaomi_mimo.py +171 -0
  102. sglang/srt/openai_api/adapter.py +58 -62
  103. sglang/srt/openai_api/protocol.py +38 -16
  104. sglang/srt/reasoning_parser.py +2 -2
  105. sglang/srt/sampling/sampling_batch_info.py +54 -2
  106. sglang/srt/sampling/sampling_params.py +2 -0
  107. sglang/srt/server_args.py +93 -24
  108. sglang/srt/speculative/eagle_worker.py +3 -2
  109. sglang/srt/utils.py +123 -10
  110. sglang/test/runners.py +4 -0
  111. sglang/test/test_block_fp8.py +2 -2
  112. sglang/test/test_deepep_utils.py +219 -0
  113. sglang/test/test_utils.py +32 -1
  114. sglang/version.py +1 -1
  115. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +18 -9
  116. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +119 -99
  117. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  118. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/test/test_deepep_utils.py ADDED
@@ -0,0 +1,219 @@
+# Copy from deepseek-ai/DeepEP/tests/test_utils.py
+
+import os
+import sys
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def init_dist(local_rank: int, num_local_ranks: int):
+    # NOTES: you may rewrite this function with your own cluster settings
+    ip = os.getenv("MASTER_ADDR", "127.0.0.1")
+    port = int(os.getenv("MASTER_PORT", "8361"))
+    num_nodes = int(os.getenv("WORLD_SIZE", 1))
+    node_rank = int(os.getenv("RANK", 0))
+    assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
+
+    dist.init_process_group(
+        backend="nccl",
+        init_method=f"tcp://{ip}:{port}",
+        world_size=num_nodes * num_local_ranks,
+        rank=node_rank * num_local_ranks + local_rank,
+    )
+    torch.set_default_dtype(torch.bfloat16)
+    torch.set_default_device("cuda")
+    torch.cuda.set_device(local_rank)
+
+    return (
+        dist.get_rank(),
+        dist.get_world_size(),
+        dist.new_group(list(range(num_local_ranks * num_nodes))),
+    )
+
+
+def calc_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double() + 1, y.double() + 1
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return (1 - sim).item()
+
+
+def per_token_cast_to_fp8(x: torch.Tensor):
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
+        m, n
+    ), (x_amax / 448.0).view(m, -1)
+
+
+def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
+    x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
+    x_scales = x_scales.view(x_fp8.size(0), -1, 1)
+    return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
+
+
+def inplace_unique(x: torch.Tensor, num_slots: int):
+    assert x.dim() == 2
+    mask = x < 0
+    x_padded = x.masked_fill(mask, num_slots)
+    bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
+    bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
+    bin_count = bin_count[:, :num_slots]
+    sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
+    sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
+    sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
+    x[:, :].fill_(-1)
+    valid_len = min(num_slots, x.size(1))
+    x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
+
+
+def create_grouped_scores(
+    scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int
+):
+    num_tokens, num_experts = scores.shape
+    scores = scores.view(num_tokens, num_groups, -1)
+    mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
+    mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
+    return (scores * mask).view(num_tokens, num_experts)
+
+
+def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
+    # Flush L2 cache with 256 MB data
+    torch.cuda.synchronize()
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+
+    # Warmup
+    for _ in range(num_warmups):
+        fn()
+
+    # Flush L2
+    cache.zero_()
+
+    # Testing
+    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    for i in range(num_tests):
+        # Record
+        start_events[i].record()
+        fn()
+        end_events[i].record()
+        if post_fn is not None:
+            post_fn()
+    torch.cuda.synchronize()
+
+    times = np.array(
+        [s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)]
+    )[1:]
+    return np.average(times), np.min(times), np.max(times)
+
+
+class empty_suppress:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_):
+        pass
+
+
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: Optional[str] = None,
+    barrier_comm_profiling: bool = False,
+):
+    # Profile
+    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
+    with suppress():
+        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+        ) as prof:
+            for i in range(2):
+                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
+                if barrier_comm_profiling:
+                    lhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    rhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    lhs @ rhs
+                    dist.all_reduce(torch.ones(1, dtype=torch.float, device="cuda"))
+                for _ in range(num_tests):
+                    fn()
+                prof.step()
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tupled = isinstance(kernel_names, tuple)
+    prof_lines = (
+        prof.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    for name in kernel_names:
+        assert (
+            sum([name in line for line in prof_lines]) == 1
+        ), f"Errors of the kernel {name} in the profiling table"
+
+    # Save chrome traces
+    if trace_path is not None:
+        prof.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        kernel_times.append(float(time_str.replace(unit, "")) / scale)
+                        break
+                break
+    return tuple(kernel_times) if is_tupled else kernel_times[0]
+
+
+def hash_tensor(t: torch.Tensor):
+    return t.view(torch.int64).sum().item()
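The helpers above are vendored verbatim from DeepEP. As a quick orientation (not shipped in the wheel), the sketch below shows how the per-token FP8 round trip fits together: `per_token_cast_to_fp8` scales each 128-element group of a row into e4m3's ±448 range and returns the quantized tensor plus per-group scales, `per_token_cast_back` undoes it, and `calc_diff` reports one minus a cosine-style similarity, so values near zero mean a near-lossless round trip. It assumes a CUDA build of PyTorch with `float8_e4m3fn` support.

```python
# Usage sketch only (not part of this diff); assumes CUDA + fp8 support.
import torch

from sglang.test.test_deepep_utils import (
    bench,
    calc_diff,
    per_token_cast_back,
    per_token_cast_to_fp8,
)

# Row width must be a multiple of 128: one scale per 128-element group.
x = torch.randn(16, 512, dtype=torch.bfloat16, device="cuda")

x_fp8, x_scales = per_token_cast_to_fp8(x)
assert x_fp8.dtype == torch.float8_e4m3fn
assert x_scales.shape == (16, 512 // 128)

# Dequantize and compare; values near 0 mean a near-lossless round trip.
x_back = per_token_cast_back(x_fp8, x_scales)
print(f"round-trip diff: {calc_diff(x, x_back):.3e}")

# bench() times a callable with CUDA events after flushing the L2 cache
# and returns (average, min, max) in seconds.
avg_t, min_t, max_t = bench(lambda: per_token_cast_to_fp8(x))
print(f"cast: avg {avg_t * 1e6:.1f} us")
```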
sglang/test/test_utils.py CHANGED
@@ -66,9 +66,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
 )
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"

 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -77,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"

 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
@@ -770,6 +773,34 @@ def run_bench_offline_throughput(model, other_args):
     return output_throughput


+def run_bench_one_batch_server(
+    model,
+    base_url,
+    server_args,
+    bench_args,
+    other_server_args,
+    simulate_spec_acc_lens=None,
+):
+    from sglang.bench_one_batch_server import run_benchmark
+
+    if simulate_spec_acc_lens is not None:
+        env = {**os.environ, "SIMULATE_ACC_LEN": str(simulate_spec_acc_lens)}
+    else:
+        env = None
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+        env=env,
+    )
+    try:
+        run_benchmark(server_args=server_args, bench_args=bench_args)
+    finally:
+        kill_process_tree(process.pid)
+
+
 def lcs(X, Y):
     m = len(X)
     n = len(Y)
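For context, a test might drive the new `run_bench_one_batch_server` helper roughly as below. This is a hypothetical sketch: the model, port, and the `ServerArgs`/`BenchArgs` construction are assumptions layered on top of this diff, not code it contains.

```python
# Hypothetical driver for the helper added above; names marked as assumed.
from sglang.bench_one_batch_server import BenchArgs  # assumed import path
from sglang.srt.server_args import ServerArgs
from sglang.test.test_utils import run_bench_one_batch_server

model = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder model
base_url = "http://127.0.0.1:30000"  # placeholder port

run_bench_one_batch_server(
    model=model,
    base_url=base_url,
    server_args=ServerArgs(model_path=model),
    bench_args=BenchArgs(base_url=base_url),  # field assumed; point at the launched server
    other_server_args=[],
    simulate_spec_acc_lens=2.0,  # exported to the server as SIMULATE_ACC_LEN
)
```

The helper launches the server, runs the benchmark against it, and tears the whole process tree down in a `finally` block, so a failed benchmark cannot leak a GPU-holding server into later tests.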
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6.post1"
+__version__ = "0.4.6.post3"
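A quick check of which side of this diff is installed in the current environment:

```python
from sglang.version import __version__

assert __version__ == "0.4.6.post3", f"got {__version__}"
```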
{sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post1
+Version: 0.4.6.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
@@ -238,20 +239,20 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
-Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
+Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
+Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
-Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
@@ -260,7 +261,6 @@ Requires-Dist: torch; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
-Requires-Dist: partial_json_parser; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -277,6 +277,9 @@ Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -318,6 +321,11 @@ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
 Requires-Dist: sglang[litellm]; extra == "all-cpu"
+Provides-Extra: all-npu
+Requires-Dist: sglang[srt_npu]; extra == "all-npu"
+Requires-Dist: sglang[openai]; extra == "all-npu"
+Requires-Dist: sglang[anthropic]; extra == "all-npu"
+Requires-Dist: sglang[litellm]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -357,6 +365,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
@@ -382,7 +391,7 @@ The core features include:

 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

 ## Getting Started
@@ -400,7 +409,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
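The dependency pins changed above (torchao, xgrammar, sgl-kernel, flashinfer_python, plus the new blobfile requirement) can be checked against an installed environment; a minimal sketch using only the standard library:

```python
# Minimal sketch: compare installed versions against the pins in the
# METADATA diff above. Uses only the standard library.
from importlib.metadata import PackageNotFoundError, version

pins = {
    "torchao": ">=0.9.0",
    "xgrammar": "==0.1.19",
    "blobfile": "==3.0.0",
    "sgl-kernel": "==0.1.1",
    "flashinfer_python": "==0.2.5",
}
for pkg, spec in pins.items():
    try:
        print(f"{pkg}: installed {version(pkg)}, pinned {spec}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed (optional extra)")
```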