sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +6 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +23 -3
  11. sglang/srt/entrypoints/openai/protocol.py +3 -1
  12. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  13. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  14. sglang/srt/eplb/expert_distribution.py +5 -0
  15. sglang/srt/eplb/expert_location.py +17 -6
  16. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  17. sglang/srt/eplb/expert_location_updater.py +2 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/step3_detector.py +436 -0
  20. sglang/srt/hf_transformers_utils.py +2 -0
  21. sglang/srt/jinja_template_utils.py +4 -1
  22. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  23. sglang/srt/layers/moe/ep_moe/layer.py +98 -603
  24. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  29. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  30. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  31. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  32. sglang/srt/layers/moe/topk.py +6 -2
  33. sglang/srt/layers/quantization/fp8.py +0 -18
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -0
  35. sglang/srt/layers/quantization/unquant.py +0 -8
  36. sglang/srt/layers/quantization/w4afp8.py +1 -0
  37. sglang/srt/managers/cache_controller.py +143 -45
  38. sglang/srt/managers/data_parallel_controller.py +6 -0
  39. sglang/srt/managers/io_struct.py +12 -2
  40. sglang/srt/managers/scheduler.py +116 -669
  41. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  42. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  43. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  44. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  45. sglang/srt/managers/template_manager.py +62 -19
  46. sglang/srt/managers/tokenizer_manager.py +166 -83
  47. sglang/srt/managers/tp_worker.py +9 -0
  48. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  49. sglang/srt/mem_cache/hicache_storage.py +45 -11
  50. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  51. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  52. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  53. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  54. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  55. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  56. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  57. sglang/srt/model_executor/model_runner.py +20 -13
  58. sglang/srt/models/arcee.py +532 -0
  59. sglang/srt/models/deepseek_v2.py +15 -56
  60. sglang/srt/models/glm4_moe.py +3 -1
  61. sglang/srt/models/granitemoe.py +3 -0
  62. sglang/srt/models/grok.py +3 -0
  63. sglang/srt/models/hunyuan.py +1 -0
  64. sglang/srt/models/llama4.py +3 -0
  65. sglang/srt/models/mixtral.py +3 -0
  66. sglang/srt/models/olmoe.py +3 -0
  67. sglang/srt/models/phimoe.py +1 -0
  68. sglang/srt/models/qwen3_moe.py +12 -69
  69. sglang/srt/models/step3_vl.py +994 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/poll_based_barrier.py +31 -0
  73. sglang/srt/reasoning_parser.py +2 -1
  74. sglang/srt/server_args.py +18 -13
  75. sglang/srt/speculative/eagle_worker.py +2 -0
  76. sglang/srt/two_batch_overlap.py +8 -3
  77. sglang/test/test_utils.py +53 -0
  78. sglang/utils.py +0 -11
  79. sglang/version.py +1 -1
  80. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
  81. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
  82. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler_update_weights_mixin.py (new file)
@@ -0,0 +1,142 @@
+import logging
+from typing import Tuple
+
+import torch
+
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGHTS
+from sglang.srt.managers.io_struct import (
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerUpdateWeightsMixin:
+
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        """In-place update of the weights from disk."""
+        success, message = self.tp_worker.update_weights_from_disk(recv_req)
+        if success:
+            flush_cache_success = self.flush_cache()
+            assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightFromDiskReqOutput(success, message, 0)
+
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        """Initialize the online model parameter update group."""
+        success, message = self.tp_worker.init_weights_update_group(recv_req)
+        return InitWeightsUpdateGroupReqOutput(success, message)
+
+    def update_weights_from_distributed(
+        self,
+        recv_req: UpdateWeightsFromDistributedReqInput,
+    ) -> Tuple[bool, str]:
+        """Update the online model parameter."""
+        success, message = self.tp_worker.update_weights_from_distributed(recv_req)
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightsFromDistributedReqOutput(success, message)
+
+    def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
+        """Update the online model parameter from tensors."""
+        success, message = self.tp_worker.update_weights_from_tensor(recv_req)
+        # TODO extract common code b/t update_weights_from_distributed and update_weights_from_tensor later
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        torch.distributed.barrier(group=self.tp_cpu_group)
+        return UpdateWeightsFromTensorReqOutput(success, message)
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.tp_worker.get_weights_by_name(recv_req)
+        return GetWeightsByNameReqOutput(parameter)
+
+    def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput):
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE)
+            self.flush_cache()
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.stashed_model_static_state = _export_static_state(
+                self.tp_worker.worker.model_runner.model
+            )
+            torch.distributed.barrier(self.tp_cpu_group)
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
+
+        return ReleaseMemoryOccupationReqOutput()
+
+    def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput):
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
+            torch.distributed.barrier(self.tp_cpu_group)
+            _import_static_state(
+                self.tp_worker.worker.model_runner.model,
+                self.stashed_model_static_state,
+            )
+            del self.stashed_model_static_state
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE)
+
+        return ResumeMemoryOccupationReqOutput()
+
+    def save_remote_model(self, params):
+        url = params["url"]
+
+        worker = self.tp_worker.worker
+
+        worker.model_runner.save_remote_model(url)
+
+    def save_sharded_model(self, params):
+        worker = self.tp_worker.worker
+
+        worker.model_runner.save_sharded_model(
+            path=params["path"],
+            pattern=params["pattern"],
+            max_size=params["max_size"],
+        )
+
+
+def _export_static_state(model):
+    return dict(
+        buffers=[
+            (name, buffer.detach().clone()) for name, buffer in model.named_buffers()
+        ]
+    )
+
+
+def _import_static_state(model, static_params):
+    self_named_buffers = dict(model.named_buffers())
+    for name, tensor in static_params["buffers"]:
+        self_named_buffers[name][...] = tensor
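
Note: the `_export_static_state` / `_import_static_state` helpers at the bottom of this new file snapshot the model's registered buffers (norm statistics, rope caches, and the like) before weight memory is paused, and copy them back in place after resume. The round trip can be exercised in isolation; the sketch below is illustrative only, using standalone function names and a plain `nn.BatchNorm1d` stand-in rather than sglang's model runner.

import torch
import torch.nn as nn


def export_static_state(model: nn.Module) -> dict:
    # Clone every registered buffer so its contents survive while
    # the GPU weight memory is released.
    return {
        "buffers": [
            (name, buf.detach().clone()) for name, buf in model.named_buffers()
        ]
    }


def import_static_state(model: nn.Module, stashed: dict) -> None:
    # Copy the stashed values back into the live buffers in place.
    named = dict(model.named_buffers())
    for name, tensor in stashed["buffers"]:
        named[name][...] = tensor


m = nn.BatchNorm1d(4)
m.running_mean += 1.0            # mutate a buffer
stash = export_static_state(m)   # stash before "releasing" weights
m.running_mean.zero_()           # simulate the buffer being reset
import_static_state(m, stash)    # restore on resume
assert torch.allclose(m.running_mean, torch.ones(4))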
sglang/srt/managers/template_manager.py
@@ -53,7 +53,7 @@ class TemplateManager:
     def __init__(self):
         self._chat_template_name: Optional[str] = None
         self._completion_template_name: Optional[str] = None
-        self._jinja_template_content_format: Optional[str] = None
+        self._jinja_template_content_format: Optional[str] = "openai"
 
     @property
     def chat_template_name(self) -> Optional[str]:
@@ -71,31 +71,60 @@ class TemplateManager:
         return self._jinja_template_content_format
 
     def load_chat_template(
-        self, tokenizer_manager, chat_template_arg: str, model_path: str
+        self, tokenizer_manager, chat_template_arg: Optional[str], model_path: str
     ) -> None:
         """
         Load a chat template from various sources.
 
         Args:
             tokenizer_manager: The tokenizer manager instance
-            chat_template_arg: Template name or file path
+            chat_template_arg: Template name, file path, or None to auto-detect
             model_path: Path to the model
         """
-        logger.info(f"Loading chat template: {chat_template_arg}")
+        if chat_template_arg:
+            self._load_explicit_chat_template(tokenizer_manager, chat_template_arg)
+        else:
+            # Try HuggingFace template first
+            hf_template = self._resolve_hf_chat_template(tokenizer_manager)
+            if hf_template:
+                self._jinja_template_content_format = (
+                    detect_jinja_template_content_format(hf_template)
+                )
+                logger.info(
+                    f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
+                )
+                return
 
-        if not chat_template_exists(chat_template_arg):
-            if not os.path.exists(chat_template_arg):
-                raise RuntimeError(
-                    f"Chat template {chat_template_arg} is not a built-in template name "
-                    "or a valid chat template file path."
+            # Fallback to SGLang template guessing
+            self.guess_chat_template_from_model_path(model_path)
+
+            # Set default format if no template was found
+            if self._chat_template_name is None:
+                self._jinja_template_content_format = "string"
+                logger.info(
+                    "No chat template found, defaulting to 'string' content format"
                 )
 
-            if chat_template_arg.endswith(".jinja"):
-                self._load_jinja_template(tokenizer_manager, chat_template_arg)
-            else:
-                self._load_json_chat_template(chat_template_arg)
-        else:
+    def _load_explicit_chat_template(
+        self, tokenizer_manager, chat_template_arg: str
+    ) -> None:
+        """Load explicitly specified chat template."""
+        logger.info(f"Loading chat template from argument: {chat_template_arg}")
+
+        if chat_template_exists(chat_template_arg):
             self._chat_template_name = chat_template_arg
+            return
+
+        if not os.path.exists(chat_template_arg):
+            raise RuntimeError(
+                f"Chat template {chat_template_arg} is not a built-in template name "
+                "or a valid chat template file path."
+            )
+
+        if chat_template_arg.endswith(".jinja"):
+            self._load_jinja_template(tokenizer_manager, chat_template_arg)
+        else:
+            self._load_json_chat_template(chat_template_arg)
 
     def guess_chat_template_from_model_path(self, model_path: str) -> None:
         """
@@ -146,10 +175,7 @@ class TemplateManager:
             completion_template: Optional completion template name/path
         """
         # Load chat template
-        if chat_template:
-            self.load_chat_template(tokenizer_manager, chat_template, model_path)
-        else:
-            self.guess_chat_template_from_model_path(model_path)
+        self.load_chat_template(tokenizer_manager, chat_template, model_path)
 
         # Load completion template
         if completion_template:
@@ -166,7 +192,7 @@ class TemplateManager:
             chat_template
         )
         logger.info(
-            f"Detected chat template content format: {self._jinja_template_content_format}"
+            f"Detected user specified Jinja chat template with content format: {self._jinja_template_content_format}"
        )
 
     def _load_json_chat_template(self, template_path: str) -> None:
@@ -224,3 +250,20 @@ class TemplateManager:
             override=True,
         )
         self._completion_template_name = template["name"]
+
+    def _resolve_hf_chat_template(self, tokenizer_manager) -> Optional[str]:
+        """
+        Resolve HuggingFace chat template.
+
+        Returns the chat template string if found, None otherwise.
+        """
+        tokenizer = tokenizer_manager.tokenizer
+
+        # Try to get AutoTokenizer chat template
+        try:
+            return tokenizer.get_chat_template()
+        except Exception as e:
+            logger.debug(f"Error getting chat template via get_chat_template(): {e}")
+
+        logger.debug("No HuggingFace chat template found")
+        return None
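
Note: taken together, these hunks change the template-resolution order: an explicit `--chat-template` argument wins; otherwise the tokenizer's bundled HuggingFace template is used, with its content format auto-detected; otherwise sglang guesses a template from the model path; and if nothing is found the Jinja content format falls back to "string". A minimal standalone sketch of that precedence follows (hypothetical function name, callbacks, and return labels; not the actual `TemplateManager` API).

from typing import Callable, Optional, Tuple


def resolve_chat_template(
    chat_template_arg: Optional[str],
    get_hf_template: Callable[[], Optional[str]],
    guess_from_model_path: Callable[[], Optional[str]],
) -> Tuple[str, str]:
    """Return (template_source, jinja_content_format) -- illustrative only."""
    # 1. An explicit --chat-template argument always wins.
    if chat_template_arg:
        return chat_template_arg, "detected-from-file"
    # 2. Otherwise prefer the tokenizer's bundled HuggingFace template
    #    (TemplateManager calls tokenizer.get_chat_template() here).
    try:
        hf_template = get_hf_template()
    except Exception:
        hf_template = None
    if hf_template:
        return "hf-default", "detected-from-template"
    # 3. Fall back to sglang's model-path based template guessing.
    guessed = guess_from_model_path()
    if guessed:
        return guessed, "openai"  # the new __init__ default in 0.4.10
    # 4. Nothing found: content format defaults to "string".
    return "none", "string"


# No explicit arg, no HF template, guess fails -> ('none', 'string')
print(resolve_chat_template(None, lambda: None, lambda: None))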