sglang 0.5.2rc1__py3-none-any.whl → 0.5.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. sglang/lang/interpreter.py +1 -1
  2. sglang/srt/configs/internvl.py +6 -0
  3. sglang/srt/disaggregation/mini_lb.py +2 -2
  4. sglang/srt/distributed/parallel_state.py +43 -40
  5. sglang/srt/entrypoints/http_server.py +5 -1
  6. sglang/srt/entrypoints/openai/protocol.py +3 -3
  7. sglang/srt/entrypoints/openai/serving_chat.py +3 -3
  8. sglang/srt/entrypoints/openai/serving_completions.py +3 -1
  9. sglang/srt/entrypoints/openai/serving_embedding.py +1 -1
  10. sglang/srt/entrypoints/openai/serving_responses.py +1 -1
  11. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  12. sglang/srt/layers/attention/aiter_backend.py +93 -68
  13. sglang/srt/layers/communicator.py +45 -7
  14. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  15. sglang/srt/layers/moe/utils.py +0 -1
  16. sglang/srt/layers/quantization/modelopt_quant.py +35 -2
  17. sglang/srt/layers/quantization/mxfp4.py +4 -1
  18. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  19. sglang/srt/layers/quantization/quark/utils.py +97 -0
  20. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  21. sglang/srt/layers/rocm_linear_utils.py +44 -0
  22. sglang/srt/layers/rotary_embedding.py +0 -18
  23. sglang/srt/managers/cache_controller.py +42 -39
  24. sglang/srt/managers/multi_tokenizer_mixin.py +4 -0
  25. sglang/srt/managers/schedule_policy.py +3 -2
  26. sglang/srt/managers/scheduler.py +4 -100
  27. sglang/srt/managers/scheduler_metrics_mixin.py +113 -7
  28. sglang/srt/managers/template_manager.py +3 -3
  29. sglang/srt/managers/tokenizer_manager.py +1 -0
  30. sglang/srt/mem_cache/allocator.py +1 -1
  31. sglang/srt/mem_cache/hicache_storage.py +15 -10
  32. sglang/srt/mem_cache/hiradix_cache.py +5 -5
  33. sglang/srt/mem_cache/memory_pool_host.py +16 -11
  34. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +10 -2
  35. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +32 -13
  36. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  37. sglang/srt/metrics/collector.py +12 -4
  38. sglang/srt/metrics/utils.py +48 -0
  39. sglang/srt/model_executor/forward_batch_info.py +16 -17
  40. sglang/srt/model_executor/model_runner.py +1 -1
  41. sglang/srt/models/deepseek_v2.py +240 -36
  42. sglang/srt/models/glm4_moe.py +10 -1
  43. sglang/srt/models/internvl.py +28 -0
  44. sglang/srt/models/minicpmv.py +165 -3
  45. sglang/srt/models/qwen2_moe.py +4 -1
  46. sglang/srt/models/qwen3.py +8 -2
  47. sglang/srt/models/qwen3_moe.py +39 -8
  48. sglang/srt/models/torch_native_llama.py +1 -1
  49. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  50. sglang/srt/server_args.py +79 -2
  51. sglang/srt/speculative/eagle_worker.py +158 -112
  52. sglang/srt/utils.py +12 -0
  53. sglang/test/few_shot_gsm8k.py +1 -0
  54. sglang/utils.py +1 -0
  55. sglang/version.py +1 -1
  56. {sglang-0.5.2rc1.dist-info → sglang-0.5.2rc2.dist-info}/METADATA +1 -1
  57. {sglang-0.5.2rc1.dist-info → sglang-0.5.2rc2.dist-info}/RECORD +65 -61
  58. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  59. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  60. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  61. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  62. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  63. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  64. {sglang-0.5.2rc1.dist-info → sglang-0.5.2rc2.dist-info}/WHEEL +0 -0
  65. {sglang-0.5.2rc1.dist-info → sglang-0.5.2rc2.dist-info}/licenses/LICENSE +0 -0
  66. {sglang-0.5.2rc1.dist-info → sglang-0.5.2rc2.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,3 @@
1
- import hashlib
2
1
  import json
3
2
  import logging
4
3
  import os
@@ -6,10 +5,8 @@ import uuid
6
5
  from dataclasses import dataclass
7
6
  from typing import Any, List, Optional
8
7
 
9
- import numpy as np
10
8
  import torch
11
9
 
12
- from sglang.srt.distributed import get_tensor_model_parallel_rank
13
10
  from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
14
11
 
15
12
  DEFAULT_GLOBAL_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024 # 4 GiB
@@ -154,21 +151,36 @@ class MooncakeStore(HiCacheStorage):
154
151
  target_location: Optional[List[int]] = None,
155
152
  target_sizes: Optional[List[int]] = None,
156
153
  ) -> bool:
157
- return self.batch_set([key], [value], [target_location], [target_sizes])
154
+ # Only support zero copy set for now
155
+ assert target_location is not None and target_sizes is not None
156
+ exist_result = self._batch_exist([key])
157
+ if exist_result[0] == 1:
158
+ return True
159
+ put_result = self._put_batch_zero_copy_impl(
160
+ [key], [target_location], [target_sizes]
161
+ )
162
+ return put_result[0] == 0
158
163
 
159
164
  def batch_set(
160
165
  self,
161
166
  keys: List[str],
162
167
  values: Optional[List[torch.Tensor]] = None,
163
- target_location: Optional[List[int]] = None,
168
+ target_locations: Optional[List[int]] = None,
164
169
  target_sizes: Optional[List[int]] = None,
165
170
  ) -> bool:
166
- assert len(keys) == len(target_location) == len(target_sizes)
171
+ # Only support zero copy set for now
172
+ assert target_locations is not None and target_sizes is not None
173
+ assert len(keys) == len(target_locations) == len(target_sizes)
174
+
167
175
  if len(keys) == 0:
168
176
  return False
169
177
 
170
178
  for i in range(len(keys)):
171
- if keys[i] is None or target_location[i] is None or target_sizes[i] is None:
179
+ if (
180
+ keys[i] is None
181
+ or target_locations[i] is None
182
+ or target_sizes[i] is None
183
+ ):
172
184
  return False
173
185
 
174
186
  exist_result = self._batch_exist(keys)
@@ -179,7 +191,7 @@ class MooncakeStore(HiCacheStorage):
179
191
  for i in range(len(keys)):
180
192
  if exist_result[i] != 1:
181
193
  set_keys.append(keys[i])
182
- set_target_locations.append(target_location[i])
194
+ set_target_locations.append(target_locations[i])
183
195
  set_target_sizes.append(target_sizes[i])
184
196
  set_indices.append(i)
185
197
  # Only set non-existing keys to storage
@@ -204,18 +216,24 @@ class MooncakeStore(HiCacheStorage):
204
216
  target_location: Optional[Any] = None,
205
217
  target_sizes: Optional[Any] = None,
206
218
  ) -> bool:
207
- return self.batch_get([key], [target_location], [target_sizes]) == 1
219
+ assert target_location is not None and target_sizes is not None
220
+ get_result = self._get_batch_zero_copy_impl(
221
+ [key], [target_location], [target_sizes]
222
+ )
223
+ return get_result[0] >= 0
208
224
 
209
225
  def batch_get(
210
226
  self,
211
227
  keys: List[str],
212
- target_location: Optional[Any] = None,
228
+ target_locations: Optional[Any] = None,
213
229
  target_sizes: Optional[Any] = None,
214
230
  ) -> int:
215
- assert len(keys) == len(target_location) == len(target_sizes)
231
+ assert len(keys) == len(target_locations) == len(target_sizes)
216
232
  if len(keys) == 0:
217
233
  return 0
218
- get_result = self._get_batch_zero_copy_impl(keys, target_location, target_sizes)
234
+ get_result = self._get_batch_zero_copy_impl(
235
+ keys, target_locations, target_sizes
236
+ )
219
237
  if self.is_mla_backend:
220
238
  key_multiplier = 1
221
239
  else:
@@ -226,7 +244,8 @@ class MooncakeStore(HiCacheStorage):
226
244
  return len(keys) // key_multiplier
227
245
 
228
246
  def exists(self, key) -> bool:
229
- return self.batch_exists([key]) > 0
247
+ exist_result = self._batch_exist([key])
248
+ return exist_result[0] == 1
230
249
 
231
250
  def batch_exists(self, keys) -> int:
232
251
  if self.is_mla_backend:
@@ -0,0 +1,161 @@
1
+ import logging
2
+ import uuid
3
+
4
+ import torch
5
+ from mooncake_store import MooncakeStore
6
+
7
+ from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
8
+
9
+ logging.basicConfig(
10
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
11
+ )
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def generate_batch_query_keys(kv_num: int, config: HiCacheStorageConfig):
    """Create random logical keys and the storage keys derived from them.

    Each logical key is ``"test_"`` followed by a fresh UUID4. For an MLA
    model every logical key maps to one storage key (``<key>_k``); otherwise
    it maps to a pair (``<key>_<tp_rank>_k`` and ``<key>_<tp_rank>_v``).

    Args:
        kv_num: Number of logical keys to generate.
        config: Object providing ``is_mla_model`` and ``tp_rank``.

    Returns:
        A ``(set_keys, get_keys, exist_keys)`` tuple. ``set_keys`` and
        ``get_keys`` are the same list object of storage keys;
        ``exist_keys`` holds the logical keys.
    """
    logical_keys = ["test_" + str(uuid.uuid4()) for _ in range(kv_num)]

    storage_keys = []
    for logical_key in logical_keys:
        if not config.is_mla_model:
            # Non-MLA layouts keep K and V as separate, rank-scoped entries.
            storage_keys.append(logical_key + f"_{config.tp_rank}_k")
            storage_keys.append(logical_key + f"_{config.tp_rank}_v")
        else:
            storage_keys.append(logical_key + "_k")

    # The same list is deliberately handed back for both set and get.
    return storage_keys, storage_keys, logical_keys
30
+
31
+
32
def test_single_operation():
    """Round-trip a single value through the store's set / exists / get APIs.

    A float32 buffer is registered with a default-constructed
    ``MooncakeStore``. The first ``value_elements`` entries are stored under
    a random key, read back into the next ``value_elements`` entries of the
    same buffer, and the two regions are compared.
    """
    print("=" * 100)
    print("Testing single operation")

    value_elements = 1024
    total_elements = 1024 * 1024 * 16  # 16 Mi float32 elements
    store = MooncakeStore()
    buffer = torch.randn(total_elements, dtype=torch.float32)
    store.register_buffer(buffer)
    value_size = value_elements * buffer.element_size()

    key = str(uuid.uuid4())
    # Source and destination are adjacent, non-overlapping views of the
    # registered buffer; the store is given their raw addresses.
    source = buffer[:value_elements]
    destination = buffer[value_elements : 2 * value_elements]
    source_ptr = source.data_ptr()
    destination_ptr = destination.data_ptr()

    # set
    was_set = store.set(key, target_location=source_ptr, target_sizes=value_size)
    assert was_set is True, f"❌set operation failed for key: {key}"

    # exists
    assert store.exists(key), f"❌key {key} should exist after set operation"

    # get
    was_read = store.get(
        key, target_location=destination_ptr, target_sizes=value_size
    )
    assert was_read is True, f"❌get operation failed for key: {key}"

    # The data read back must match what was stored.
    assert torch.allclose(
        source, destination, atol=1e-6
    ), f"❌get operation failed for key: {key}"

    logger.info(f"✅ Single operation passed")
67
+
68
+
69
def test_batch_operation(config: HiCacheStorageConfig):
    """Round-trip several values through the store's batch APIs.

    Storage keys for ``kv_num`` logical keys are generated for ``config``.
    One value per storage key is written from the front of a registered
    buffer with ``batch_set``, read back with ``batch_get`` into the slots
    immediately after, and each source/destination pair is compared.

    Args:
        config: Storage configuration passed to ``MooncakeStore`` and to
            ``generate_batch_query_keys``.
    """
    print("=" * 100)
    print(f"Testing batch operation with config: {config}")

    kv_num = 13
    value_elements = 256
    total_elements = 1024 * 1024 * 16  # 16 Mi float32 elements
    store = MooncakeStore(config)
    buffer = torch.randn(total_elements, dtype=torch.float32)
    store.register_buffer(buffer)
    value_size = value_elements * buffer.element_size()

    set_keys, get_keys, exist_keys = generate_batch_query_keys(kv_num, config)
    num_set_keys = len(set_keys)

    def slot(index):
        # View over the index-th value-sized region of the buffer.
        start = index * value_elements
        return buffer[start : start + value_elements]

    source_slices = [slot(i) for i in range(num_set_keys)]
    source_ptrs = [view.data_ptr() for view in source_slices]
    target_sizes = [value_size] * num_set_keys

    # batch set
    was_set = store.batch_set(
        set_keys, target_locations=source_ptrs, target_sizes=target_sizes
    )
    assert was_set is True, f"❌batch set operation failed"

    # batch exists
    assert store.batch_exists(
        exist_keys
    ), f"❌keys should exist after batch set operation"

    # batch get: destinations start right after the last source slot.
    dest_slices = [slot(num_set_keys + i) for i in range(len(get_keys))]
    dest_ptrs = [view.data_ptr() for view in dest_slices]
    fetched = store.batch_get(
        get_keys, target_locations=dest_ptrs, target_sizes=target_sizes
    )
    assert fetched == kv_num, f"❌batch get operation failed"
    for get_key, source, dest in zip(get_keys, source_slices, dest_slices):
        assert torch.allclose(
            source, dest, atol=1e-6
        ), f"❌batch get operation failed for key: {get_key}"

    logger.info(f"✅ Batch operation passed")
121
+
122
+
123
if __name__ == "__main__":
    test_single_operation()

    # (is_mla_model, tp_rank, tp_size) combinations exercised by the batch
    # test, in execution order.
    for mla_flag, rank, world_size in (
        (False, 0, 1),
        (True, 0, 1),
        (False, 1, 4),
        (True, 3, 8),
    ):
        test_batch_operation(
            HiCacheStorageConfig(
                is_mla_model=mla_flag,
                tp_rank=rank,
                tp_size=world_size,
                model_name=None,
                is_page_first_layout=True,
            )
        )

    logger.info(f"✅ All tests passed")
@@ -18,6 +18,8 @@ from dataclasses import dataclass
18
18
  from enum import Enum
19
19
  from typing import Dict, List, Optional, Union
20
20
 
21
+ from sglang.srt.metrics.utils import generate_buckets
22
+ from sglang.srt.server_args import ServerArgs
21
23
  from sglang.srt.utils import get_bool_env_var
22
24
 
23
25
  SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
@@ -309,6 +311,7 @@ class SchedulerMetricsCollector:
309
311
  class TokenizerMetricsCollector:
310
312
  def __init__(
311
313
  self,
314
+ server_args: ServerArgs,
312
315
  labels: Dict[str, str],
313
316
  bucket_time_to_first_token: Optional[List[float]] = None,
314
317
  bucket_inter_token_latency: Optional[List[float]] = None,
@@ -334,7 +337,7 @@ class TokenizerMetricsCollector:
334
337
  )
335
338
 
336
339
  if collect_tokens_histogram:
337
- bucket_prompt_tokens = [
340
+ default_bucket_prompt_tokens = [
338
341
  100,
339
342
  300,
340
343
  500,
@@ -363,9 +366,11 @@ class TokenizerMetricsCollector:
363
366
  name="sglang:prompt_tokens_histogram",
364
367
  documentation="Histogram of prompt token length.",
365
368
  labelnames=labels.keys(),
366
- buckets=bucket_prompt_tokens,
369
+ buckets=generate_buckets(
370
+ server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
371
+ ),
367
372
  )
368
- bucket_generation_tokens = [
373
+ default_bucket_generation_tokens = [
369
374
  100,
370
375
  300,
371
376
  500,
@@ -390,7 +395,10 @@ class TokenizerMetricsCollector:
390
395
  name="sglang:generation_tokens_histogram",
391
396
  documentation="Histogram of generation token length.",
392
397
  labelnames=labels.keys(),
393
- buckets=bucket_generation_tokens,
398
+ buckets=generate_buckets(
399
+ server_args.generation_tokens_buckets,
400
+ default_bucket_generation_tokens,
401
+ ),
394
402
  )
395
403
 
396
404
  self.cached_tokens_total = Counter(
@@ -0,0 +1,48 @@
1
+ # Copyright 2023-2025 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+ """Utilities for Prometheus Metrics."""
15
+ import math
16
+ from typing import List
17
+
18
+
19
def two_sides_exponential_buckets(
    middle: float, base: float, count: int
) -> List[float]:
    """Return bucket boundaries spreading exponentially around ``middle``.

    The result contains ``middle`` itself plus, for each of
    ``ceil(count / 2)`` steps, one boundary above and one below ``middle``.
    The distance from ``middle`` is multiplied by ``base`` at every step
    (``base``, ``base**2``, ...). Boundaries below zero are clamped to 0.

    Args:
        middle: Centre value of the bucket range.
        base: Growth factor applied to the distance at each step.
        count: Controls the number of steps; ``ceil(count / 2)`` are taken.

    Returns:
        The boundaries in ascending order with duplicates removed.
    """
    steps = math.ceil(count / 2)
    boundaries = [middle]
    offset = 1
    for _ in range(steps):
        offset *= base
        boundaries.extend((middle + offset, max(0, middle - offset)))
    # Clamping can yield several zeros; collapse duplicates before sorting.
    unique_boundaries = set(boundaries)
    return sorted(unique_boundaries)
31
+
32
+
33
def generate_buckets(
    buckets_rule: List[str], default_buckets: List[float]
) -> List[float]:
    """Turn a bucket rule specification into sorted histogram boundaries.

    The first element of ``buckets_rule`` selects the rule; the remaining
    elements are its arguments:

    * ``["default"]`` (or an empty / ``None`` rule): use ``default_buckets``.
    * ``["tse", middle, base, count]``: delegate to
      ``two_sides_exponential_buckets``; ``base`` must be greater than 1.0.
    * ``["customer", v1, v2, ...]``: use the listed values, parsed as floats.

    Args:
        buckets_rule: Rule name followed by its arguments, or a falsy value
            to select the defaults.
        default_buckets: Boundaries used by the ``"default"`` rule.

    Returns:
        The boundaries in ascending order with duplicates removed.

    Raises:
        ValueError: If the rule name is unknown, ``"tse"`` does not get
            exactly three arguments, ``base`` is not greater than 1.0, or an
            argument cannot be parsed as a number.
    """
    if not buckets_rule:
        return sorted(set(default_buckets))

    rule, *params = buckets_rule

    if rule == "default":
        return sorted(set(default_buckets))

    if rule == "tse":
        if len(params) != 3:
            raise ValueError(
                "Rule 'tse' expects exactly 3 arguments (middle, base, count), "
                f"got {len(params)}"
            )
        middle, base, count = params
        # Explicit check instead of `assert`, which is removed under
        # `python -O`. Written as `not (... > 1.0)` so NaN is rejected too.
        if not float(base) > 1.0:
            raise ValueError("Base must be greater than 1.0")
        return two_sides_exponential_buckets(float(middle), float(base), int(count))

    if rule == "customer":
        return sorted({float(x) for x in params})

    # Without this, an unknown rule name would be parsed as custom values
    # whenever assertions are disabled.
    raise ValueError(
        f"Unknown buckets rule {rule!r}; expected 'default', 'tse' or 'customer'"
    )
@@ -516,24 +516,23 @@ class ForwardBatch:
516
516
  for batch_idx in range(batch_size):
517
517
  mm_input = batch.multimodal_inputs[batch_idx]
518
518
  if self.forward_mode.is_decode():
519
- mrope_position_deltas = (
520
- [0]
521
- if mm_input is None
522
- else flatten_nested_list(mm_input.mrope_position_delta.tolist())
523
- )
524
- next_input_positions = []
525
- for mrope_position_delta in mrope_position_deltas:
526
- # batched deltas needs to be processed separately
527
- # Convert list of lists to tensor with shape [3, seq_len]
528
- next_input_positions += [
529
- MRotaryEmbedding.get_next_input_positions(
530
- mrope_position_delta,
531
- int(self.seq_lens[batch_idx]) - 1,
532
- int(self.seq_lens[batch_idx]),
533
- )
534
- ]
535
519
  # 3 * N
536
- mrope_positions_list[batch_idx] = torch.cat(next_input_positions, dim=1)
520
+ if mm_input is None:
521
+ mrope_positions_list[batch_idx] = torch.full(
522
+ (3, 1),
523
+ self.seq_lens[batch_idx] - 1,
524
+ dtype=torch.int64,
525
+ device=model_runner.device,
526
+ )
527
+ else:
528
+ mrope_position_deltas = mm_input.mrope_position_delta.flatten().to(
529
+ model_runner.device, non_blocking=True
530
+ )
531
+ mrope_positions_list[batch_idx] = (
532
+ (mrope_position_deltas + self.seq_lens[batch_idx] - 1)
533
+ .unsqueeze(0)
534
+ .repeat(3, 1)
535
+ )
537
536
  elif self.forward_mode.is_extend():
538
537
  extend_seq_len, extend_prefix_len = (
539
538
  batch.extend_seq_lens[batch_idx],
@@ -1655,7 +1655,7 @@ class ModelRunner:
1655
1655
 
1656
1656
  def apply_torch_tp(self):
1657
1657
  logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
1658
- from sglang.srt.model_parallel import tensor_parallel
1658
+ from sglang.srt.layers.model_parallel import tensor_parallel
1659
1659
 
1660
1660
  device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
1661
1661
  tensor_parallel(self.model, device_mesh)