sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/lang/interpreter.py +40 -1
  5. sglang/lang/ir.py +27 -0
  6. sglang/math_utils.py +8 -0
  7. sglang/srt/configs/model_config.py +6 -0
  8. sglang/srt/conversation.py +6 -0
  9. sglang/srt/disaggregation/base/__init__.py +1 -1
  10. sglang/srt/disaggregation/base/conn.py +25 -11
  11. sglang/srt/disaggregation/common/__init__.py +5 -1
  12. sglang/srt/disaggregation/common/utils.py +42 -0
  13. sglang/srt/disaggregation/decode.py +196 -51
  14. sglang/srt/disaggregation/fake/__init__.py +1 -1
  15. sglang/srt/disaggregation/fake/conn.py +15 -9
  16. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +18 -13
  18. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  19. sglang/srt/disaggregation/nixl/conn.py +17 -12
  20. sglang/srt/disaggregation/prefill.py +128 -43
  21. sglang/srt/disaggregation/utils.py +127 -123
  22. sglang/srt/entrypoints/engine.py +15 -1
  23. sglang/srt/entrypoints/http_server.py +13 -2
  24. sglang/srt/eplb_simulator/__init__.py +1 -0
  25. sglang/srt/eplb_simulator/reader.py +51 -0
  26. sglang/srt/layers/activation.py +19 -0
  27. sglang/srt/layers/attention/aiter_backend.py +15 -2
  28. sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
  29. sglang/srt/layers/attention/flashattention_backend.py +53 -64
  30. sglang/srt/layers/attention/flashinfer_backend.py +1 -2
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
  32. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  33. sglang/srt/layers/attention/triton_backend.py +119 -119
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  35. sglang/srt/layers/attention/vision.py +51 -24
  36. sglang/srt/layers/communicator.py +23 -5
  37. sglang/srt/layers/linear.py +0 -4
  38. sglang/srt/layers/logits_processor.py +0 -12
  39. sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
  40. sglang/srt/layers/moe/ep_moe/layer.py +42 -32
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
  43. sglang/srt/layers/moe/topk.py +16 -8
  44. sglang/srt/layers/pooler.py +56 -0
  45. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  46. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  47. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  49. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  50. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  51. sglang/srt/layers/radix_attention.py +2 -3
  52. sglang/srt/lora/lora_manager.py +79 -34
  53. sglang/srt/lora/mem_pool.py +4 -5
  54. sglang/srt/managers/cache_controller.py +2 -1
  55. sglang/srt/managers/io_struct.py +28 -4
  56. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  57. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  58. sglang/srt/managers/schedule_batch.py +39 -6
  59. sglang/srt/managers/scheduler.py +73 -17
  60. sglang/srt/managers/tokenizer_manager.py +29 -2
  61. sglang/srt/mem_cache/chunk_cache.py +1 -0
  62. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  63. sglang/srt/mem_cache/memory_pool.py +111 -407
  64. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  65. sglang/srt/mem_cache/radix_cache.py +36 -12
  66. sglang/srt/model_executor/cuda_graph_runner.py +122 -55
  67. sglang/srt/model_executor/forward_batch_info.py +14 -5
  68. sglang/srt/model_executor/model_runner.py +6 -6
  69. sglang/srt/model_loader/loader.py +8 -1
  70. sglang/srt/models/bert.py +113 -13
  71. sglang/srt/models/deepseek_v2.py +113 -155
  72. sglang/srt/models/internvl.py +46 -102
  73. sglang/srt/models/roberta.py +117 -9
  74. sglang/srt/models/vila.py +305 -0
  75. sglang/srt/openai_api/adapter.py +162 -4
  76. sglang/srt/openai_api/protocol.py +37 -1
  77. sglang/srt/sampling/sampling_batch_info.py +24 -0
  78. sglang/srt/sampling/sampling_params.py +2 -0
  79. sglang/srt/server_args.py +318 -233
  80. sglang/srt/speculative/build_eagle_tree.py +1 -1
  81. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
  82. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
  83. sglang/srt/speculative/eagle_utils.py +389 -109
  84. sglang/srt/speculative/eagle_worker.py +134 -43
  85. sglang/srt/two_batch_overlap.py +4 -2
  86. sglang/srt/utils.py +58 -0
  87. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  88. sglang/test/runners.py +38 -3
  89. sglang/test/test_block_fp8.py +1 -0
  90. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  91. sglang/test/test_block_fp8_ep.py +1 -0
  92. sglang/test/test_utils.py +3 -1
  93. sglang/utils.py +9 -0
  94. sglang/version.py +1 -1
  95. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
  96. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
  97. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8_deep_gemm_blackwell.py ADDED
@@ -0,0 +1,252 @@
+ import itertools
+ import os
+ import unittest
+ from typing import List, Tuple
+
+ import torch
+ from deep_gemm import fp8_gemm_nt
+
+ from sglang.test.test_utils import CustomTestCase
+
+ _is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+ # Modified from DeepGEMM Blackwell
+ def ceil_div(x: int, y: int) -> int:
+     return (x + y - 1) // y
+
+
+ def align(x: int, y: int) -> int:
+     return ceil_div(x, y) * y
+
+
+ def per_token_group_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2 and x.size(1) % 128 == 0
+     m, n = x.shape
+     x_view = x.view(m, -1, 128)
+     x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+     sf = x_amax / 448.0
+     return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+ def per_block_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2
+     m, n = x.shape
+     x_padded = torch.zeros(
+         (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+     )
+     x_padded[:m, :n] = x
+     x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+     sf = x_amax / 448.0
+     x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+         x_view.size(0), x_view.size(2)
+     )
+
+
+ def ceil_to_ue8m0(x: torch.Tensor):
+     assert x.view(-1).amax().item() > 0
+     return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+ def per_token_group_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2 and x.size(1) % 128 == 0
+     m, n = x.shape
+     x_view = x.view(m, -1, 128)
+     x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+     sf = ceil_to_ue8m0(x_amax / 448.0)
+     return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+ def per_block_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2
+     m, n = x.shape
+     x_padded = torch.zeros(
+         (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+     )
+     x_padded[:m, :n] = x
+     x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+     sf = ceil_to_ue8m0(x_amax / 448.0)
+     x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+         x_view.size(0), x_view.size(2)
+     )
+
+
+ # For test
+ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+     """This function performs matrix multiplication with block-wise quantization using native torch.
+
+     It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+     The output is returned in the specified `output_dtype`.
+     """
+
+     A = A.to(torch.float32)
+     B = B.to(torch.float32)
+     assert A.shape[-1] == B.shape[-1]
+     assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+     assert len(block_size) == 2
+     block_n, block_k = block_size[0], block_size[1]
+     assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+     assert A.shape[:-1] == As.shape[:-1]
+
+     M = A.numel() // A.shape[-1]
+     N, K = B.shape
+     origin_C_shape = A.shape[:-1] + (N,)
+     A = A.reshape(M, A.shape[-1])
+     As = As.reshape(M, As.shape[-1])
+     n_tiles = (N + block_n - 1) // block_n
+     k_tiles = (K + block_k - 1) // block_k
+     assert n_tiles == Bs.shape[0]
+     assert k_tiles == Bs.shape[1]
+
+     C_shape = (M, N)
+     C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+     A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+     B_tiles = [
+         [
+             B[
+                 j * block_n : min((j + 1) * block_n, N),
+                 i * block_k : min((i + 1) * block_k, K),
+             ]
+             for i in range(k_tiles)
+         ]
+         for j in range(n_tiles)
+     ]
+     C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+     As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+     for i in range(k_tiles):
+         for j in range(n_tiles):
+             a = A_tiles[i]
+             b = B_tiles[j][i]
+             c = C_tiles[j]
+             s = As_tiles[i] * Bs[j][i]
+             c[:, :] += torch.matmul(a, b.t()) * s
+
+     C = C.reshape(origin_C_shape).to(output_dtype)
+     return C
+
+
+ def block_quant_dequant(
+     x_q_block: torch.Tensor,
+     x_s: torch.Tensor,
+     block_size: List[int],
+     dtype: torch.dtype,
+ ) -> torch.Tensor:
+     """This function converts block-wise quantization to unquantized.
+     The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+     and the block size.
+     The output is an unquantized tensor with dtype.
+     """
+     block_n, block_k = block_size[0], block_size[1]
+     n, k = x_q_block.shape
+     n_tiles = (n + block_n - 1) // block_n
+     k_tiles = (k + block_k - 1) // block_k
+     assert n_tiles == x_s.shape[0]
+     assert k_tiles == x_s.shape[1]
+
+     x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+     for j in range(n_tiles):
+         for i in range(k_tiles):
+             x_q_block_tile = x_q_block[
+                 j * block_n : min((j + 1) * block_n, n),
+                 i * block_k : min((i + 1) * block_k, k),
+             ]
+             x_dq_block_tile = x_dq_block[
+                 j * block_n : min((j + 1) * block_n, n),
+                 i * block_k : min((i + 1) * block_k, k),
+             ]
+             x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+     return x_dq_block
+
+
+ class TestDeepGemmBlackwell(CustomTestCase):
+
+     if not _is_cuda:
+         OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+         M = [1, 7, 83, 512, 2048]
+         NKs = [
+             (N, K)
+             for N in [128, 512, 1024, 4096, 7748, 13824]
+             for K in [256, 4096, 5120, 3884, 13824]
+         ]
+         # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+         BLOCK_SIZE = [[128, 128]]
+         SEEDS = [0]
+     else:
+         # use practical shape in DeepSeek V3 for test
+         OUT_DTYPES = [torch.bfloat16]
+         M = [64, 128, 512, 1024, 4096]
+         NKs = [
+             (2112, 7168),
+             (1536, 7168),
+             # (3072, 1536),
+             # (24576, 7168),
+             # (4096, 512),
+             # (7168, 2048),
+             # (4608, 7168),
+             # (512, 7168),
+             # (7168, 2304),
+             # (7168, 512),
+         ]
+         BLOCK_SIZE = [[128, 128]]
+         SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _test_deep_gemm_blackwell(self, M, NK, block_size, out_dtype, seed):
+         N, K = NK
+         torch.manual_seed(seed)
+
+         A = torch.empty((M, K), dtype=torch.bfloat16).normal_(0, 0.2)
+         B = torch.empty((N, K), dtype=torch.bfloat16).normal_(0, 0.2)
+
+         A_q, A_s = per_token_group_quant_fp8(A)
+         B_q, B_s = per_block_quant_fp8(B)
+
+         A_dq = block_quant_dequant(A_q, A_s, [1, block_size[1]], out_dtype)
+         B_dq = block_quant_dequant(B_q, B_s, block_size, out_dtype)
+
+         A_qu = per_token_group_quant_mxfp8(A_dq)
+         B_qu = per_block_quant_mxfp8(B_dq)
+         out = None
+
+         with torch.inference_mode():
+             ref_out = native_w8a8_block_fp8_matmul(
+                 A_q, B_q, A_s, B_s, block_size, out_dtype
+             )
+             out = torch.empty_like(ref_out)
+             fp8_gemm_nt(A_qu, B_qu, out)
+
+         torch.testing.assert_close(out, ref_out, atol=1e-1, rtol=1e-2)
+
+     def test_deep_gemm_blackwell(self):
+         for params in itertools.product(
+             self.M,
+             self.NKs,
+             self.BLOCK_SIZE,
+             self.OUT_DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 M=params[0],
+                 NKs=params[1],
+                 block_size=params[2],
+                 out_dtype=params[3],
+                 seed=params[4],
+             ):
+                 self._test_deep_gemm_blackwell(*params)
+
+
+ if __name__ == "__main__":
+     unittest.main(verbosity=2)
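
The `ceil_to_ue8m0` helper in this new test rounds each scale factor up to the nearest power of two, which is the constraint imposed by the exponent-only UE8M0 scale format that DeepGEMM's Blackwell kernels consume. A small standalone sketch of that rounding, with arbitrarily chosen illustration values (not taken from the test):

import torch

# Round scales up to the next power of two (UE8M0-style), as ceil_to_ue8m0 does above.
scales = torch.tensor([0.0031, 0.2500, 0.7000])
ue8m0 = torch.pow(2.0, torch.ceil(torch.log2(scales)))
print(ue8m0)  # tensor([0.0039, 0.2500, 1.0000]) -> 2**-8, 2**-2, 2**0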
sglang/test/test_block_fp8_ep.py CHANGED
@@ -84,6 +84,7 @@ def ep_moe(
  top_k,
  hidden_states.shape[1],
  BLOCK_SIZE=512,
+ use_per_token_if_dynamic=True,
  )

  seg_indptr_cur_rank = seg_indptr[start_expert_id : end_expert_id + 2]
sglang/test/test_utils.py CHANGED
@@ -41,6 +41,8 @@ DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"

  # MLA test models
+ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+ DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST = "cross-encoder/ms-marco-MiniLM-L6-v2"
  DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
  DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
  DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
@@ -85,7 +87,7 @@ DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
  DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
  DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"

- DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600


  def is_in_ci():
sglang/utils.py CHANGED
@@ -512,3 +512,12 @@ async def async_stream_and_merge(llm, prompt, sampling_params):
  cleaned_chunk = trim_overlap(final_text, chunk_text)
  final_text += cleaned_chunk
  yield cleaned_chunk  # yield the non-overlapping portion
+
+
+ def resolve_obj_by_qualname(qualname: str) -> Any:
+     """
+     Resolve an object by its fully qualified name.
+     """
+     module_name, obj_name = qualname.rsplit(".", 1)
+     module = importlib.import_module(module_name)
+     return getattr(module, obj_name)
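
For context, the new `resolve_obj_by_qualname` helper splits a dotted path, imports the module, and returns the named attribute. A minimal usage sketch, assuming the 0.4.7.post1 wheel is installed; `json.loads` is an arbitrary example target, not something this diff references:

from sglang.utils import resolve_obj_by_qualname

# Resolve a "module.attribute" string into the actual object.
loads_fn = resolve_obj_by_qualname("json.loads")
assert loads_fn('{"a": 1}') == {"a": 1}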
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.7"
+ __version__ = "0.4.7.post1"
{sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.7
+ Version: 0.4.7.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -248,7 +248,7 @@ Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.1.7; extra == "srt"
+ Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
  Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
  Requires-Dist: torch==2.7.1; extra == "srt"
  Requires-Dist: torchaudio==2.7.1; extra == "srt"
@@ -371,7 +371,7 @@ Dynamic: license-file

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+ | [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
  | [**Documentation**](https://docs.sglang.ai/)
  | [**Join Slack**](https://slack.sglang.ai/)
  | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -403,7 +403,7 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:

- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -422,7 +422,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)

  ## Adoption and Sponsorship
- SGLang has been deployed at large scale, serving trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+ SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.

  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>