sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. sglang/lang/interpreter.py +1 -1
  2. sglang/srt/configs/internvl.py +6 -0
  3. sglang/srt/configs/model_config.py +2 -1
  4. sglang/srt/disaggregation/mini_lb.py +2 -2
  5. sglang/srt/distributed/parallel_state.py +46 -41
  6. sglang/srt/entrypoints/engine.py +1 -1
  7. sglang/srt/entrypoints/http_server.py +5 -1
  8. sglang/srt/entrypoints/openai/protocol.py +3 -3
  9. sglang/srt/entrypoints/openai/serving_chat.py +3 -3
  10. sglang/srt/entrypoints/openai/serving_completions.py +3 -1
  11. sglang/srt/entrypoints/openai/serving_embedding.py +1 -1
  12. sglang/srt/entrypoints/openai/serving_responses.py +1 -1
  13. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  14. sglang/srt/layers/attention/aiter_backend.py +93 -68
  15. sglang/srt/layers/communicator.py +45 -7
  16. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  17. sglang/srt/layers/moe/ep_moe/layer.py +2 -7
  18. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  19. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  21. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  22. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  23. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  24. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  25. sglang/srt/layers/moe/utils.py +0 -1
  26. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
  27. sglang/srt/layers/quantization/modelopt_quant.py +35 -2
  28. sglang/srt/layers/quantization/mxfp4.py +4 -1
  29. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  30. sglang/srt/layers/quantization/quark/utils.py +97 -0
  31. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  32. sglang/srt/layers/quantization/w4afp8.py +30 -25
  33. sglang/srt/layers/rocm_linear_utils.py +44 -0
  34. sglang/srt/layers/rotary_embedding.py +0 -18
  35. sglang/srt/managers/cache_controller.py +42 -39
  36. sglang/srt/managers/detokenizer_manager.py +0 -34
  37. sglang/srt/managers/multi_tokenizer_mixin.py +48 -6
  38. sglang/srt/managers/schedule_policy.py +3 -2
  39. sglang/srt/managers/scheduler.py +7 -100
  40. sglang/srt/managers/scheduler_metrics_mixin.py +113 -7
  41. sglang/srt/managers/template_manager.py +3 -3
  42. sglang/srt/managers/tokenizer_manager.py +1 -0
  43. sglang/srt/mem_cache/allocator.py +1 -1
  44. sglang/srt/mem_cache/hicache_storage.py +15 -10
  45. sglang/srt/mem_cache/hiradix_cache.py +16 -0
  46. sglang/srt/mem_cache/memory_pool_host.py +18 -11
  47. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  48. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +35 -6
  49. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +32 -13
  50. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  51. sglang/srt/metrics/collector.py +12 -4
  52. sglang/srt/metrics/utils.py +48 -0
  53. sglang/srt/model_executor/forward_batch_info.py +16 -17
  54. sglang/srt/model_executor/model_runner.py +1 -1
  55. sglang/srt/models/deepseek_v2.py +245 -36
  56. sglang/srt/models/glm4_moe.py +10 -1
  57. sglang/srt/models/gpt_oss.py +5 -4
  58. sglang/srt/models/internvl.py +28 -0
  59. sglang/srt/models/longcat_flash.py +26 -15
  60. sglang/srt/models/longcat_flash_nextn.py +23 -15
  61. sglang/srt/models/minicpmv.py +165 -3
  62. sglang/srt/models/qwen2_moe.py +4 -1
  63. sglang/srt/models/qwen3.py +8 -2
  64. sglang/srt/models/qwen3_moe.py +39 -8
  65. sglang/srt/models/torch_native_llama.py +1 -1
  66. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  67. sglang/srt/server_args.py +79 -2
  68. sglang/srt/speculative/eagle_worker.py +158 -112
  69. sglang/srt/utils.py +12 -10
  70. sglang/test/few_shot_gsm8k.py +1 -0
  71. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  72. sglang/utils.py +1 -0
  73. sglang/version.py +1 -1
  74. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA +2 -2
  75. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/RECORD +83 -76
  76. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  77. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  78. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  79. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  80. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  81. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  82. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/WHEEL +0 -0
  83. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
1
+ import logging
2
+ import uuid
3
+
4
+ import torch
5
+ from mooncake_store import MooncakeStore
6
+
7
+ from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
8
+
9
+ logging.basicConfig(
10
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
11
+ )
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def generate_batch_query_keys(kv_num: int, config: HiCacheStorageConfig):
    """Create random keys for exercising the batch store APIs.

    ``kv_num`` base keys of the form ``"test_<uuid4>"`` are generated. Each base
    key expands to a single storage key (``<key>_k``) when
    ``config.is_mla_model`` is truthy, otherwise to a pair
    (``<key>_<tp_rank>_k`` followed by ``<key>_<tp_rank>_v``).

    Returns:
        A ``(set_keys, get_keys, exist_keys)`` tuple. ``get_keys`` is the very
        same list object as ``set_keys``; ``exist_keys`` holds the unexpanded
        base keys.
    """
    base_keys = ["test_" + str(uuid.uuid4()) for _ in range(kv_num)]

    expanded = []
    for base in base_keys:
        if config.is_mla_model:
            expanded.append(base + "_k")
            continue
        # Non-MLA layout: one K entry and one V entry, both tagged with the rank.
        ranked = f"{base}_{config.tp_rank}"
        expanded.extend((ranked + "_k", ranked + "_v"))

    return expanded, expanded, base_keys
30
+
31
+
32
def test_single_operation():
    """Round-trip a single key through the store's ``set``/``exists``/``get``.

    The first ``num_elems`` elements of a registered float32 buffer are passed
    to ``set`` under a random key, and the following ``num_elems`` elements are
    passed to ``get`` as the destination; the two regions must then compare
    equal.
    """
    print("=" * 100)
    print("Testing single operation")

    num_elems = 1024
    store = MooncakeStore()
    # 16 Mi float32 elements; the whole tensor is registered with the store.
    buffer = torch.randn(1024 * 1024 * 16, dtype=torch.float32)
    store.register_buffer(buffer)
    nbytes = num_elems * buffer.element_size()

    key = str(uuid.uuid4())
    # Two adjacent, non-overlapping views of the registered buffer.
    src = buffer[:num_elems]
    dst = buffer[num_elems : 2 * num_elems]

    # Test set operation
    outcome = store.set(key, target_location=src.data_ptr(), target_sizes=nbytes)
    assert outcome is True, f"❌set operation failed for key: {key}"

    # Test exists operation
    assert store.exists(key), f"❌key {key} should exist after set operation"

    # Test get operation
    outcome = store.get(key, target_location=dst.data_ptr(), target_sizes=nbytes)
    assert outcome is True, f"❌get operation failed for key: {key}"

    # Source and destination regions must hold the same values
    regions_match = torch.allclose(src, dst, atol=1e-6)
    assert regions_match, f"❌get operation failed for key: {key}"

    logger.info("✅ Single operation passed")
67
+
68
+
69
def test_batch_operation(config: HiCacheStorageConfig):
    """Round-trip several keys through ``batch_set``/``batch_exists``/``batch_get``.

    Keys come from ``generate_batch_query_keys``. The source regions occupy the
    first ``len(set_keys)`` fixed-size chunks of a registered float32 buffer and
    the destination regions occupy the chunks immediately after them; every
    source chunk must compare equal to its destination chunk afterwards.
    """
    print("=" * 100)
    print(f"Testing batch operation with config: {config}")

    value_elements = 256
    kv_num = 13
    store = MooncakeStore(config)
    # 16 Mi float32 elements; the whole tensor is registered with the store.
    buffer = torch.randn(1024 * 1024 * 16, dtype=torch.float32)
    store.register_buffer(buffer)
    value_size = value_elements * buffer.element_size()

    set_keys, get_keys, exist_keys = generate_batch_query_keys(kv_num, config)
    num_set = len(set_keys)

    def chunk(index):
        # View of the index-th ``value_elements``-sized chunk of the buffer.
        start = index * value_elements
        return buffer[start : start + value_elements]

    set_slices = [chunk(i) for i in range(num_set)]
    set_locations = [view.data_ptr() for view in set_slices]
    target_sizes = [value_size] * num_set

    # Test batch set operation
    outcome = store.batch_set(
        set_keys, target_locations=set_locations, target_sizes=target_sizes
    )
    assert outcome is True, "❌batch set operation failed"

    # Test batch exists operation
    assert store.batch_exists(
        exist_keys
    ), "❌keys should exist after batch set operation"

    # Test batch get operation: destinations start right after the source chunks
    get_slices = [chunk(num_set + i) for i in range(len(get_keys))]
    get_locations = [view.data_ptr() for view in get_slices]
    outcome = store.batch_get(
        get_keys, target_locations=get_locations, target_sizes=target_sizes
    )
    assert outcome == kv_num, "❌batch get operation failed"
    for idx, key in enumerate(get_keys):
        assert torch.allclose(
            set_slices[idx], get_slices[idx], atol=1e-6
        ), f"❌batch get operation failed for key: {key}"

    logger.info("✅ Batch operation passed")
121
+
122
+
123
if __name__ == "__main__":
    test_single_operation()

    # (is_mla_model, tp_rank, tp_size) for each batch run, in execution order.
    batch_cases = (
        (False, 0, 1),
        (True, 0, 1),
        (False, 1, 4),
        (True, 3, 8),
    )
    for mla_flag, rank, world in batch_cases:
        test_batch_operation(
            HiCacheStorageConfig(
                is_mla_model=mla_flag,
                tp_rank=rank,
                tp_size=world,
                model_name=None,
                is_page_first_layout=True,
            )
        )

    logger.info("✅ All tests passed")
@@ -18,6 +18,8 @@ from dataclasses import dataclass
18
18
  from enum import Enum
19
19
  from typing import Dict, List, Optional, Union
20
20
 
21
+ from sglang.srt.metrics.utils import generate_buckets
22
+ from sglang.srt.server_args import ServerArgs
21
23
  from sglang.srt.utils import get_bool_env_var
22
24
 
23
25
  SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
@@ -309,6 +311,7 @@ class SchedulerMetricsCollector:
309
311
  class TokenizerMetricsCollector:
310
312
  def __init__(
311
313
  self,
314
+ server_args: ServerArgs,
312
315
  labels: Dict[str, str],
313
316
  bucket_time_to_first_token: Optional[List[float]] = None,
314
317
  bucket_inter_token_latency: Optional[List[float]] = None,
@@ -334,7 +337,7 @@ class TokenizerMetricsCollector:
334
337
  )
335
338
 
336
339
  if collect_tokens_histogram:
337
- bucket_prompt_tokens = [
340
+ default_bucket_prompt_tokens = [
338
341
  100,
339
342
  300,
340
343
  500,
@@ -363,9 +366,11 @@ class TokenizerMetricsCollector:
363
366
  name="sglang:prompt_tokens_histogram",
364
367
  documentation="Histogram of prompt token length.",
365
368
  labelnames=labels.keys(),
366
- buckets=bucket_prompt_tokens,
369
+ buckets=generate_buckets(
370
+ server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
371
+ ),
367
372
  )
368
- bucket_generation_tokens = [
373
+ default_bucket_generation_tokens = [
369
374
  100,
370
375
  300,
371
376
  500,
@@ -390,7 +395,10 @@ class TokenizerMetricsCollector:
390
395
  name="sglang:generation_tokens_histogram",
391
396
  documentation="Histogram of generation token length.",
392
397
  labelnames=labels.keys(),
393
- buckets=bucket_generation_tokens,
398
+ buckets=generate_buckets(
399
+ server_args.generation_tokens_buckets,
400
+ default_bucket_generation_tokens,
401
+ ),
394
402
  )
395
403
 
396
404
  self.cached_tokens_total = Counter(
@@ -0,0 +1,48 @@
1
+ # Copyright 2023-2025 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+ """Utilities for Prometheus Metrics."""
15
+ import math
16
+ from typing import List
17
+
18
+
19
def two_sides_exponential_buckets(
    middle: float, base: float, count: int
) -> List[float]:
    """Return bucket boundaries spread exponentially on both sides of ``middle``.

    For each of ``ceil(count / 2)`` steps the running offset is multiplied by
    ``base`` (starting from 1) and both ``middle + offset`` and
    ``middle - offset`` are recorded, the latter clamped so it never drops
    below 0. ``middle`` itself is always included. The result is sorted and
    free of duplicates.
    """
    steps = math.ceil(count / 2)
    edges = {middle}
    offset = 1
    for _ in range(steps):
        offset *= base
        # Upper edge first, then the clamped lower edge.
        edges.update((middle + offset, max(0, middle - offset)))
    return sorted(edges)
31
+
32
+
33
def generate_buckets(
    buckets_rule: List[str], default_buckets: List[float]
) -> List[float]:
    """Build a sorted, de-duplicated list of histogram bucket boundaries.

    ``buckets_rule`` is a list of string tokens whose first element names the
    rule and whose remaining elements are that rule's arguments:

    * ``["default"]``, or an empty / ``None`` rule: use ``default_buckets``.
    * ``["tse", middle, base, count]``: delegate to
      ``two_sides_exponential_buckets(float(middle), float(base), int(count))``.
      ``base`` must be greater than 1.0.
    * ``["customer", v1, v2, ...]``: use the listed values converted to float.

    Raises:
        ValueError: if the rule name is unknown, a ``tse`` rule does not carry
            exactly three arguments, its base is not greater than 1.0, or a
            numeric token cannot be parsed.
    """
    if not buckets_rule:
        buckets_rule = ["default"]

    rule, *rule_args = buckets_rule

    if rule == "default":
        return sorted(set(default_buckets))

    if rule == "tse":
        if len(rule_args) != 3:
            raise ValueError(
                "Rule 'tse' expects exactly 3 arguments (middle, base, count), "
                f"got {len(rule_args)}"
            )
        middle, base, count = rule_args
        # Explicit check instead of ``assert`` so it still runs under ``python -O``;
        # written as ``not >`` so a NaN base is rejected as well.
        if not float(base) > 1.0:
            raise ValueError("Base must be greater than 1.0")
        return two_sides_exponential_buckets(float(middle), float(base), int(count))

    if rule == "customer":
        return sorted({float(value) for value in rule_args})

    # Reject unknown rules loudly rather than parsing them as a custom list.
    raise ValueError(
        f"Unknown buckets rule {rule!r}; expected 'default', 'tse' or 'customer'"
    )
@@ -516,24 +516,23 @@ class ForwardBatch:
516
516
  for batch_idx in range(batch_size):
517
517
  mm_input = batch.multimodal_inputs[batch_idx]
518
518
  if self.forward_mode.is_decode():
519
- mrope_position_deltas = (
520
- [0]
521
- if mm_input is None
522
- else flatten_nested_list(mm_input.mrope_position_delta.tolist())
523
- )
524
- next_input_positions = []
525
- for mrope_position_delta in mrope_position_deltas:
526
- # batched deltas needs to be processed separately
527
- # Convert list of lists to tensor with shape [3, seq_len]
528
- next_input_positions += [
529
- MRotaryEmbedding.get_next_input_positions(
530
- mrope_position_delta,
531
- int(self.seq_lens[batch_idx]) - 1,
532
- int(self.seq_lens[batch_idx]),
533
- )
534
- ]
535
519
  # 3 * N
536
- mrope_positions_list[batch_idx] = torch.cat(next_input_positions, dim=1)
520
+ if mm_input is None:
521
+ mrope_positions_list[batch_idx] = torch.full(
522
+ (3, 1),
523
+ self.seq_lens[batch_idx] - 1,
524
+ dtype=torch.int64,
525
+ device=model_runner.device,
526
+ )
527
+ else:
528
+ mrope_position_deltas = mm_input.mrope_position_delta.flatten().to(
529
+ model_runner.device, non_blocking=True
530
+ )
531
+ mrope_positions_list[batch_idx] = (
532
+ (mrope_position_deltas + self.seq_lens[batch_idx] - 1)
533
+ .unsqueeze(0)
534
+ .repeat(3, 1)
535
+ )
537
536
  elif self.forward_mode.is_extend():
538
537
  extend_seq_len, extend_prefix_len = (
539
538
  batch.extend_seq_lens[batch_idx],
@@ -1655,7 +1655,7 @@ class ModelRunner:
1655
1655
 
1656
1656
  def apply_torch_tp(self):
1657
1657
  logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
1658
- from sglang.srt.model_parallel import tensor_parallel
1658
+ from sglang.srt.layers.model_parallel import tensor_parallel
1659
1659
 
1660
1660
  device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
1661
1661
  tensor_parallel(self.model, device_mesh)