sglang 0.5.3.post1__py3-none-any.whl → 0.5.3.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (216)
  1. sglang/bench_one_batch.py +12 -2
  2. sglang/bench_one_batch_server.py +40 -25
  3. sglang/bench_serving.py +110 -21
  4. sglang/compile_deep_gemm.py +3 -2
  5. sglang/global_config.py +1 -25
  6. sglang/lang/api.py +6 -0
  7. sglang/lang/interpreter.py +1 -0
  8. sglang/lang/ir.py +13 -0
  9. sglang/launch_server.py +9 -2
  10. sglang/profiler.py +18 -1
  11. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  12. sglang/srt/compilation/backend.py +431 -0
  13. sglang/srt/compilation/compilation_config.py +19 -0
  14. sglang/srt/compilation/compilation_counter.py +47 -0
  15. sglang/srt/compilation/compile.py +210 -0
  16. sglang/srt/compilation/compiler_interface.py +477 -0
  17. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  18. sglang/srt/compilation/fix_functionalization.py +134 -0
  19. sglang/srt/compilation/fx_utils.py +83 -0
  20. sglang/srt/compilation/inductor_pass.py +140 -0
  21. sglang/srt/compilation/pass_manager.py +66 -0
  22. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  23. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  24. sglang/srt/configs/load_config.py +3 -2
  25. sglang/srt/configs/mamba_utils.py +1 -1
  26. sglang/srt/configs/model_config.py +17 -3
  27. sglang/srt/configs/qwen3_next.py +0 -3
  28. sglang/srt/constrained/base_grammar_backend.py +5 -1
  29. sglang/srt/constrained/llguidance_backend.py +3 -0
  30. sglang/srt/constrained/outlines_backend.py +1 -1
  31. sglang/srt/constrained/xgrammar_backend.py +5 -1
  32. sglang/srt/disaggregation/decode.py +12 -10
  33. sglang/srt/disaggregation/prefill.py +6 -4
  34. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  35. sglang/srt/distributed/device_communicators/custom_all_reduce.py +2 -2
  36. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  37. sglang/srt/distributed/parallel_state.py +37 -5
  38. sglang/srt/entrypoints/context.py +3 -1
  39. sglang/srt/entrypoints/engine.py +1 -1
  40. sglang/srt/entrypoints/grpc_server.py +172 -28
  41. sglang/srt/entrypoints/harmony_utils.py +2 -1
  42. sglang/srt/entrypoints/http_server.py +2 -1
  43. sglang/srt/entrypoints/openai/protocol.py +3 -0
  44. sglang/srt/entrypoints/openai/serving_base.py +2 -1
  45. sglang/srt/entrypoints/openai/serving_chat.py +3 -2
  46. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  47. sglang/srt/entrypoints/openai/serving_responses.py +1 -1
  48. sglang/srt/environ.py +10 -0
  49. sglang/srt/eplb/expert_distribution.py +3 -4
  50. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  51. sglang/srt/eplb/expert_location_updater.py +2 -2
  52. sglang/srt/function_call/base_format_detector.py +17 -18
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/utils.py +2 -1
  55. sglang/srt/grpc/compile_proto.py +2 -2
  56. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +89 -25
  57. sglang/srt/grpc/sglang_scheduler_pb2.py +38 -38
  58. sglang/srt/grpc/sglang_scheduler_pb2.pyi +2 -4
  59. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +1 -1
  60. sglang/srt/layers/attention/aiter_backend.py +3 -3
  61. sglang/srt/layers/attention/attention_registry.py +2 -0
  62. sglang/srt/layers/attention/base_attn_backend.py +19 -0
  63. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  64. sglang/srt/layers/attention/fla/utils.py +0 -3
  65. sglang/srt/layers/attention/flashattention_backend.py +12 -7
  66. sglang/srt/layers/attention/flashinfer_backend.py +18 -15
  67. sglang/srt/layers/attention/flashinfer_mla_backend.py +9 -9
  68. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  69. sglang/srt/layers/attention/nsa/nsa_indexer.py +10 -4
  70. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  71. sglang/srt/layers/attention/triton_backend.py +71 -32
  72. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  73. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  74. sglang/srt/layers/attention/trtllm_mla_backend.py +5 -5
  75. sglang/srt/layers/attention/vision.py +3 -3
  76. sglang/srt/layers/communicator.py +8 -5
  77. sglang/srt/layers/layernorm.py +10 -5
  78. sglang/srt/layers/logits_processor.py +13 -13
  79. sglang/srt/layers/moe/cutlass_w4a8_moe.py +196 -0
  80. sglang/srt/layers/moe/ep_moe/layer.py +24 -4
  81. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  82. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  83. sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -2
  84. sglang/srt/layers/moe/router.py +51 -15
  85. sglang/srt/layers/moe/token_dispatcher/__init__.py +8 -0
  86. sglang/srt/layers/moe/token_dispatcher/deepep.py +13 -4
  87. sglang/srt/layers/moe/token_dispatcher/mooncake.py +394 -0
  88. sglang/srt/layers/moe/utils.py +8 -0
  89. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -2
  90. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +3 -5
  91. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +3 -2
  92. sglang/srt/layers/quantization/fp8_kernel.py +35 -8
  93. sglang/srt/layers/quantization/fp8_utils.py +1 -3
  94. sglang/srt/layers/quantization/int8_kernel.py +19 -3
  95. sglang/srt/layers/quantization/modelopt_quant.py +9 -19
  96. sglang/srt/layers/quantization/mxfp4.py +4 -4
  97. sglang/srt/layers/quantization/w4afp8.py +47 -1
  98. sglang/srt/layers/radix_attention.py +59 -9
  99. sglang/srt/layers/rotary_embedding.py +33 -9
  100. sglang/srt/layers/sampler.py +33 -13
  101. sglang/srt/lora/eviction_policy.py +139 -0
  102. sglang/srt/lora/lora_manager.py +23 -0
  103. sglang/srt/lora/lora_registry.py +1 -1
  104. sglang/srt/lora/mem_pool.py +40 -16
  105. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  106. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  107. sglang/srt/managers/cache_controller.py +45 -13
  108. sglang/srt/managers/data_parallel_controller.py +123 -27
  109. sglang/srt/managers/detokenizer_manager.py +3 -0
  110. sglang/srt/managers/io_struct.py +43 -3
  111. sglang/srt/managers/mm_utils.py +2 -2
  112. sglang/srt/managers/multi_tokenizer_mixin.py +17 -0
  113. sglang/srt/managers/overlap_utils.py +82 -7
  114. sglang/srt/managers/schedule_batch.py +180 -469
  115. sglang/srt/managers/schedule_policy.py +12 -0
  116. sglang/srt/managers/scheduler.py +248 -142
  117. sglang/srt/managers/scheduler_metrics_mixin.py +51 -2
  118. sglang/srt/managers/scheduler_output_processor_mixin.py +97 -10
  119. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  120. sglang/srt/managers/scheduler_update_weights_mixin.py +19 -14
  121. sglang/srt/managers/tokenizer_communicator_mixin.py +2 -0
  122. sglang/srt/managers/tokenizer_manager.py +45 -7
  123. sglang/srt/managers/tp_worker.py +30 -12
  124. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  125. sglang/srt/mem_cache/chunk_cache.py +5 -1
  126. sglang/srt/mem_cache/common.py +475 -0
  127. sglang/srt/mem_cache/hicache_storage.py +4 -1
  128. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  129. sglang/srt/mem_cache/mamba_radix_cache.py +995 -0
  130. sglang/srt/mem_cache/memory_pool.py +38 -29
  131. sglang/srt/mem_cache/radix_cache.py +91 -17
  132. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  133. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  134. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  135. sglang/srt/mem_cache/storage/eic/eic_storage.py +3 -1
  136. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  137. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +4 -2
  138. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +44 -17
  139. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +37 -7
  140. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +16 -1
  141. sglang/srt/mem_cache/swa_radix_cache.py +25 -15
  142. sglang/srt/metrics/collector.py +18 -0
  143. sglang/srt/model_executor/cuda_graph_runner.py +3 -2
  144. sglang/srt/model_executor/forward_batch_info.py +10 -4
  145. sglang/srt/model_executor/model_runner.py +210 -66
  146. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +527 -0
  147. sglang/srt/model_loader/loader.py +114 -39
  148. sglang/srt/models/apertus.py +2 -3
  149. sglang/srt/models/arcee.py +2 -2
  150. sglang/srt/models/bailing_moe.py +8 -8
  151. sglang/srt/models/bailing_moe_nextn.py +3 -4
  152. sglang/srt/models/deepseek_nextn.py +2 -2
  153. sglang/srt/models/deepseek_v2.py +49 -32
  154. sglang/srt/models/dots_vlm_vit.py +1 -1
  155. sglang/srt/models/falcon_h1.py +2 -9
  156. sglang/srt/models/glm4_moe.py +8 -12
  157. sglang/srt/models/glm4_moe_nextn.py +2 -2
  158. sglang/srt/models/glm4v.py +1 -1
  159. sglang/srt/models/glm4v_moe.py +5 -5
  160. sglang/srt/models/gpt_oss.py +4 -4
  161. sglang/srt/models/grok.py +5 -10
  162. sglang/srt/models/kimi_vl.py +1 -7
  163. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  164. sglang/srt/models/llama.py +2 -2
  165. sglang/srt/models/longcat_flash.py +3 -7
  166. sglang/srt/models/minicpmo.py +7 -2
  167. sglang/srt/models/mllama4.py +2 -2
  168. sglang/srt/models/qwen2_5_vl.py +1 -1
  169. sglang/srt/models/qwen2_moe.py +4 -4
  170. sglang/srt/models/qwen2_vl.py +1 -1
  171. sglang/srt/models/qwen3_moe.py +4 -4
  172. sglang/srt/models/qwen3_next.py +2 -2
  173. sglang/srt/models/qwen3_next_mtp.py +3 -4
  174. sglang/srt/models/qwen3_vl.py +9 -10
  175. sglang/srt/models/qwen3_vl_moe.py +6 -15
  176. sglang/srt/models/step3_vl.py +2 -3
  177. sglang/srt/sampling/custom_logit_processor.py +2 -1
  178. sglang/srt/sampling/sampling_batch_info.py +6 -13
  179. sglang/srt/sampling/sampling_params.py +70 -2
  180. sglang/srt/server_args.py +245 -31
  181. sglang/srt/single_batch_overlap.py +0 -1
  182. sglang/srt/speculative/draft_utils.py +210 -0
  183. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -6
  184. sglang/srt/speculative/eagle_info.py +53 -17
  185. sglang/srt/speculative/eagle_info_v2.py +404 -0
  186. sglang/srt/speculative/eagle_utils.py +138 -0
  187. sglang/srt/speculative/eagle_worker.py +55 -223
  188. sglang/srt/speculative/eagle_worker_v2.py +484 -0
  189. sglang/srt/speculative/ngram_info.py +14 -9
  190. sglang/srt/speculative/spec_utils.py +1 -1
  191. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  192. sglang/srt/two_batch_overlap.py +22 -13
  193. sglang/srt/utils/__init__.py +1 -1
  194. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  195. sglang/srt/utils/common.py +77 -24
  196. sglang/srt/utils/hf_transformers_utils.py +2 -1
  197. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  198. sglang/srt/utils/profile_merger.py +199 -0
  199. sglang/test/run_eval.py +1 -0
  200. sglang/test/runners.py +2 -0
  201. sglang/test/simple_eval_common.py +3 -0
  202. sglang/test/simple_eval_longbench_v2.py +33 -21
  203. sglang/test/test_cutlass_moe.py +1 -1
  204. sglang/test/test_deterministic.py +2 -5
  205. sglang/test/test_deterministic_utils.py +3 -3
  206. sglang/test/test_utils.py +5 -6
  207. sglang/version.py +1 -1
  208. {sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/METADATA +23 -11
  209. {sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/RECORD +215 -192
  210. sglang/srt/speculative/build_eagle_tree.py +0 -427
  211. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  212. /sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -0
  213. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  214. {sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/WHEEL +0 -0
  215. {sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/licenses/LICENSE +0 -0
  216. {sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -51,6 +51,7 @@ import logging
51
51
  import multiprocessing
52
52
  import os
53
53
  import time
54
+ from types import SimpleNamespace
54
55
  from typing import Tuple
55
56
 
56
57
  import numpy as np
@@ -257,11 +258,18 @@ def prepare_synthetic_inputs_for_latency_test(
257
258
 
258
259
  @torch.no_grad
259
260
  def extend(reqs, model_runner):
261
+ # Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
262
+ dummy_tree_cache = SimpleNamespace(
263
+ page_size=1,
264
+ device=model_runner.device,
265
+ token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
266
+ )
267
+
260
268
  batch = ScheduleBatch.init_new(
261
269
  reqs=reqs,
262
270
  req_to_token_pool=model_runner.req_to_token_pool,
263
271
  token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
264
- tree_cache=None,
272
+ tree_cache=dummy_tree_cache,
265
273
  model_config=model_runner.model_config,
266
274
  enable_overlap=False,
267
275
  spec_algorithm=SpeculativeAlgorithm.NONE,
@@ -510,7 +518,9 @@ def latency_test(
510
518
 
511
519
  # Set CPU affinity
512
520
  if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
513
- set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
521
+ set_gpu_proc_affinity(
522
+ server_args.pp_size, server_args.tp_size, server_args.nnodes, tp_rank
523
+ )
514
524
 
515
525
  # Configure the logger
516
526
  configure_logger(server_args, prefix=f" TP{tp_rank}")
sglang/bench_one_batch_server.py CHANGED
@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
25
25
  import numpy as np
26
26
  import requests
27
27
  from pydantic import BaseModel
28
+ from transformers import AutoProcessor, PreTrainedTokenizer
28
29
 
29
30
  from sglang.bench_serving import (
31
+ get_processor,
30
32
  get_tokenizer,
31
33
  sample_mmmu_requests,
32
34
  sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
104
106
  if self.profile_links.extend or self.profile_links.decode:
105
107
  # Create a combined link or use the first available one
106
108
  trace_files = [self.profile_links.extend, self.profile_links.decode]
109
+ if any(trace_file is None for trace_file in trace_files):
110
+ logger.error("Some trace files are None", f"{trace_files=}")
107
111
  trace_files_relay_links = [
108
- f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
112
+ (
113
+ f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
114
+ if trace_file
115
+ else "N/A"
116
+ )
109
117
  for trace_file in trace_files
110
118
  ]
111
119
 
@@ -114,30 +122,31 @@ Note: To view the traces through perfetto-ui, please:
114
122
  # Build the row
115
123
  return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
116
124
 
117
- @classmethod
118
- def generate_markdown_report(
119
- cls, trace_dir, results: List["BenchmarkResult"]
120
- ) -> str:
121
- """Generate a markdown report from a list of BenchmarkResult object from a single run."""
122
- import os
123
125
 
124
- summary = f"### {results[0].model_path}\n"
126
+ def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
127
+ """Generate a markdown report from a list of BenchmarkResult object from a single run."""
128
+ import os
129
+
130
+ summary = f"### {results[0].model_path}\n"
125
131
 
126
- # summary += (
127
- # f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
128
- # )
129
- summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
130
- summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
132
+ # summary += (
133
+ # f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
134
+ # )
135
+ summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
136
+ summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
131
137
 
132
- # all results should share the same isl & osl
133
- for result in results:
134
- base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
135
- relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
136
- relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
137
- # base_url = "https://github.com/sgl-project/ci-data/traces"
138
- summary += result.to_markdown_row(trace_dir, base_url, relay_base)
138
+ # all results should share the same isl & osl
139
+ for result in results:
140
+ base_url = os.getenv(
141
+ "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
142
+ ).rstrip("/")
143
+ relay_base = os.getenv(
144
+ "PERFETTO_RELAY_URL",
145
+ "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
146
+ ).rstrip("/")
147
+ summary += result.to_markdown_row(trace_dir, base_url, relay_base)
139
148
 
140
- return summary
149
+ return summary
141
150
 
142
151
 
143
152
  @dataclasses.dataclass
@@ -288,7 +297,7 @@ def run_one_case(
288
297
  input_len_step_percentage: float,
289
298
  run_name: str,
290
299
  result_filename: str,
291
- tokenizer,
300
+ tokenizer: PreTrainedTokenizer | AutoProcessor,
292
301
  dataset_name="",
293
302
  profile: bool = False,
294
303
  profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
302
311
  if dataset_name == "mmmu":
303
312
  input_requests = sample_mmmu_requests(
304
313
  num_requests=batch_size,
305
- tokenizer=tokenizer,
314
+ processor=tokenizer,
306
315
  fixed_output_len=output_len,
307
- apply_chat_template=True,
308
316
  random_sample=False,
309
317
  )
310
318
  elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
364
372
  if dataset_name == "mmmu":
365
373
  # vlm
366
374
  input_ids = []
375
+ # for vlms, tokenizer is an instance of AutoProcessor
376
+ tokenizer = tokenizer.tokenizer
367
377
  for input_req in input_requests:
368
378
  input_ids += [tokenizer.encode(input_req.prompt)]
369
379
  payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
609
619
  tokenizer_path = server_info["tokenizer_path"]
610
620
  elif "prefill" in server_info:
611
621
  tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
612
- tokenizer = get_tokenizer(tokenizer_path)
622
+
623
+ if bench_args.dataset_name == "mmmu":
624
+ # mmmu implies this is a MLLM
625
+ tokenizer = get_processor(tokenizer_path)
626
+ else:
627
+ tokenizer = get_tokenizer(tokenizer_path)
613
628
 
614
629
  # warmup
615
630
  if not bench_args.skip_warmup:
sglang/bench_serving.py CHANGED
@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
12
12
 
13
13
  import argparse
14
14
  import asyncio
15
- import base64
16
15
  import io
17
16
  import json
18
17
  import os
@@ -623,6 +622,48 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
623
622
  return output
624
623
 
625
624
 
625
+ def _build_profile_urls(
626
+ profile_prefill_url: Optional[List[str]],
627
+ profile_decode_url: Optional[List[str]],
628
+ ) -> List[Tuple[str, str]]:
629
+ """Build profile URLs list from prefill/decode URL arguments.
630
+
631
+ Returns:
632
+ List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
633
+ """
634
+ profile_urls = []
635
+ if profile_prefill_url:
636
+ for idx, url in enumerate(profile_prefill_url):
637
+ profile_urls.append((f"Prefill-{idx}", url))
638
+ if profile_decode_url:
639
+ for idx, url in enumerate(profile_decode_url):
640
+ profile_urls.append((f"Decode-{idx}", url))
641
+ return profile_urls
642
+
643
+
644
+ async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: str) -> None:
645
+ """Call profile endpoint (start/stop) on PD separated workers.
646
+
647
+ Args:
648
+ profile_urls: List of (worker_type, url) tuples
649
+ mode: "start" or "stop"
650
+ """
651
+ endpoint = "/start_profile" if mode == "start" else "/stop_profile"
652
+ action = "Starting" if mode == "start" else "Stopping"
653
+ action_past = "started" if mode == "start" else "stopped"
654
+
655
+ print(f"{action} profiler...")
656
+
657
+ for worker_type, url in profile_urls:
658
+ profile_output = await async_request_profile(api_url=url + endpoint)
659
+ if profile_output.success:
660
+ print(f"Profiler {action_past} for {worker_type} worker at {url}")
661
+ else:
662
+ print(
663
+ f"Failed to {mode} profiler for {worker_type} worker at {url}: {profile_output.error}"
664
+ )
665
+
666
+
626
667
  def get_model(pretrained_model_name_or_path: str) -> str:
627
668
  if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
628
669
  import huggingface_hub.constants
@@ -671,7 +712,7 @@ def get_processor(
671
712
  if pretrained_model_name_or_path.endswith(
672
713
  ".json"
673
714
  ) or pretrained_model_name_or_path.endswith(".model"):
674
- from sglang.srt.hf_transformers_utils import get_processor
715
+ from sglang.srt.utils.hf_transformers_utils import get_processor
675
716
 
676
717
  return get_processor(pretrained_model_name_or_path)
677
718
 
@@ -935,7 +976,7 @@ async def get_mooncake_request_over_time(
935
976
  for i in range(num_rounds):
936
977
  # Add user query for the current round
937
978
  chat_history.append(
938
- {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
979
+ {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
939
980
  )
940
981
 
941
982
  # Form the full prompt from history
@@ -964,7 +1005,7 @@ async def get_mooncake_request_over_time(
964
1005
 
965
1006
  def sample_mmmu_requests(
966
1007
  num_requests: int,
967
- processor: AutoProcessor,
1008
+ processor: AutoProcessor | AutoTokenizer,
968
1009
  fixed_output_len: Optional[int] = None,
969
1010
  random_sample: bool = True,
970
1011
  ) -> List[DatasetRow]:
@@ -973,9 +1014,7 @@ def sample_mmmu_requests(
973
1014
 
974
1015
  Args:
975
1016
  num_requests: Number of requests to sample.
976
- tokenizer: Tokenizer to use for token counting.
977
1017
  fixed_output_len: If provided, use this fixed output length for all requests.
978
- apply_chat_template: Whether to apply the chat template to the prompt.
979
1018
  random_sample: Whether to randomly sample or take the first N.
980
1019
 
981
1020
  Returns:
@@ -1282,11 +1321,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
1282
1321
  )
1283
1322
 
1284
1323
 
1285
- def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
1324
+ def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
1286
1325
  try:
1287
1326
  content_items = [
1288
- {"type": "image_url", "image_url": {"url": img_url}}
1289
- for img_url in images_base64
1327
+ {"type": "image", "image": {"url": image_base64}}
1328
+ for image_base64 in images_base64
1290
1329
  ]
1291
1330
  content_items.append({"type": "text", "text": text_prompt})
1292
1331
  prompt_str = processor.apply_chat_template(
@@ -1294,7 +1333,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
1294
1333
  add_generation_prompt=True,
1295
1334
  tokenize=False,
1296
1335
  )
1297
- except Exception:
1336
+ except Exception as e:
1337
+ # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
1338
+ print(f"Error applying chat template: {e}, fallback to <image> tag")
1298
1339
  # Some tokenizers do not support list content; fall back to a placeholder in the text
1299
1340
  prompt_str = f"<image>{text_prompt}"
1300
1341
 
@@ -1425,7 +1466,7 @@ def sample_image_requests(
1425
1466
  print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
1426
1467
  print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
1427
1468
  print(
1428
- f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
1469
+ f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
1429
1470
  )
1430
1471
  return dataset
1431
1472
 
@@ -1676,6 +1717,8 @@ async def benchmark(
1676
1717
  use_trace_timestamps: bool = False,
1677
1718
  mooncake_slowdown_factor=1.0,
1678
1719
  mooncake_num_rounds=1,
1720
+ profile_prefill_url: Optional[List[str]] = None,
1721
+ profile_decode_url: Optional[List[str]] = None,
1679
1722
  ):
1680
1723
  if backend in ASYNC_REQUEST_FUNCS:
1681
1724
  request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1765,14 +1808,28 @@ async def benchmark(
1765
1808
 
1766
1809
  time.sleep(1.0)
1767
1810
 
1811
+ # Build profile URLs for PD separated mode (do this once at the beginning)
1812
+ pd_profile_urls = []
1813
+ if profile and pd_separated:
1814
+ pd_profile_urls = _build_profile_urls(profile_prefill_url, profile_decode_url)
1815
+ if not pd_profile_urls:
1816
+ print(
1817
+ "Warning: PD separated mode requires --profile-prefill-url or --profile-decode-url"
1818
+ )
1819
+ print("Skipping profiler start. Please specify worker URLs for profiling.")
1820
+
1768
1821
  # Start profiler
1769
1822
  if profile:
1770
- print("Starting profiler...")
1771
- profile_output = await async_request_profile(
1772
- api_url=base_url + "/start_profile"
1773
- )
1774
- if profile_output.success:
1775
- print("Profiler started")
1823
+ if pd_separated:
1824
+ if pd_profile_urls:
1825
+ await _call_profile_pd(pd_profile_urls, "start")
1826
+ else:
1827
+ print("Starting profiler...")
1828
+ profile_output = await async_request_profile(
1829
+ api_url=base_url + "/start_profile"
1830
+ )
1831
+ if profile_output.success:
1832
+ print("Profiler started")
1776
1833
 
1777
1834
  # Run all requests
1778
1835
  benchmark_start_time = time.perf_counter()
@@ -1821,10 +1878,16 @@ async def benchmark(
1821
1878
 
1822
1879
  # Stop profiler
1823
1880
  if profile:
1824
- print("Stopping profiler...")
1825
- profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
1826
- if profile_output.success:
1827
- print("Profiler stopped")
1881
+ if pd_separated:
1882
+ if pd_profile_urls:
1883
+ await _call_profile_pd(pd_profile_urls, "stop")
1884
+ else:
1885
+ print("Stopping profiler...")
1886
+ profile_output = await async_request_profile(
1887
+ api_url=base_url + "/stop_profile"
1888
+ )
1889
+ if profile_output.success:
1890
+ print("Profiler stopped")
1828
1891
 
1829
1892
  if pbar is not None:
1830
1893
  pbar.close()
@@ -2205,6 +2268,8 @@ def run_benchmark(args_: argparse.Namespace):
2205
2268
  use_trace_timestamps=args.use_trace_timestamps,
2206
2269
  mooncake_slowdown_factor=args.mooncake_slowdown_factor,
2207
2270
  mooncake_num_rounds=args.mooncake_num_rounds,
2271
+ profile_prefill_url=getattr(args, "profile_prefill_url", None),
2272
+ profile_decode_url=getattr(args, "profile_decode_url", None),
2208
2273
  )
2209
2274
  )
2210
2275
 
@@ -2430,6 +2495,30 @@ if __name__ == "__main__":
2430
2495
  action="store_true",
2431
2496
  help="Benchmark PD disaggregation server",
2432
2497
  )
2498
+
2499
+ # Create a mutually exclusive group for profiling URLs
2500
+ # In PD separated mode, prefill and decode workers must be profiled separately
2501
+ profile_url_group = parser.add_mutually_exclusive_group()
2502
+ profile_url_group.add_argument(
2503
+ "--profile-prefill-url",
2504
+ type=str,
2505
+ nargs="*",
2506
+ default=None,
2507
+ help="URL(s) of the prefill worker(s) for profiling in PD separated mode. "
2508
+ "Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. "
2509
+ "NOTE: Cannot be used together with --profile-decode-url. "
2510
+ "In PD separated mode, prefill and decode workers must be profiled separately.",
2511
+ )
2512
+ profile_url_group.add_argument(
2513
+ "--profile-decode-url",
2514
+ type=str,
2515
+ nargs="*",
2516
+ default=None,
2517
+ help="URL(s) of the decode worker(s) for profiling in PD separated mode. "
2518
+ "Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. "
2519
+ "NOTE: Cannot be used together with --profile-prefill-url. "
2520
+ "In PD separated mode, prefill and decode workers must be profiled separately.",
2521
+ )
2433
2522
  parser.add_argument(
2434
2523
  "--flush-cache",
2435
2524
  action="store_true",
sglang/compile_deep_gemm.py CHANGED
@@ -19,6 +19,7 @@ import requests
19
19
 
20
20
  from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
21
21
  from sglang.srt.entrypoints.http_server import launch_server
22
+ from sglang.srt.environ import envs
22
23
  from sglang.srt.managers.io_struct import GenerateReqInput
23
24
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
24
25
  from sglang.srt.server_args import ServerArgs
@@ -28,9 +29,9 @@ from sglang.srt.warmup import warmup
28
29
  multiprocessing.set_start_method("spawn", force=True)
29
30
 
30
31
  # Reduce warning
31
- os.environ["SGL_IN_DEEPGEMM_PRECOMPILE_STAGE"] = "1"
32
+ envs.SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE.set(True)
32
33
  # Force enable deep gemm
33
- os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"
34
+ envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(True)
34
35
  # Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
35
36
  os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"
36
37
 
sglang/global_config.py CHANGED
@@ -1,14 +1,11 @@
1
1
  """Global configurations"""
2
2
 
3
- import os
3
+ # FIXME: deprecate this file and move all usage to sglang.srt.environ or sglang.__init__.py
4
4
 
5
5
 
6
6
  class GlobalConfig:
7
7
  """
8
8
  Store some global constants.
9
-
10
- See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
11
- many global runtime arguments as well.
12
9
  """
13
10
 
14
11
  def __init__(self):
@@ -20,27 +17,6 @@ class GlobalConfig:
20
17
  # Default backend of the language
21
18
  self.default_backend = None
22
19
 
23
- # Runtime constants: New generation token ratio estimation
24
- self.default_init_new_token_ratio = float(
25
- os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7)
26
- )
27
- self.default_min_new_token_ratio_factor = float(
28
- os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14)
29
- )
30
- self.default_new_token_ratio_decay_steps = float(
31
- os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
32
- )
33
- self.torch_empty_cache_interval = float(
34
- os.environ.get(
35
- "SGLANG_EMPTY_CACHE_INTERVAL", -1
36
- ) # in seconds. Set if you observe high memory accumulation over a long serving period.
37
- )
38
- # Runtime constants: others
39
- self.retract_decode_steps = 20
40
- self.flashinfer_workspace_size = int(
41
- os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024)
42
- )
43
-
44
20
  # Output tokenization configs
45
21
  self.skip_special_tokens_in_output = True
46
22
  self.spaces_between_special_tokens_in_out = True
sglang/lang/api.py CHANGED
@@ -79,6 +79,7 @@ def gen(
79
79
  n: Optional[int] = None,
80
80
  stop: Optional[Union[str, List[str]]] = None,
81
81
  stop_token_ids: Optional[List[int]] = None,
82
+ stop_regex: Optional[Union[str, List[str]]] = None,
82
83
  temperature: Optional[float] = None,
83
84
  top_p: Optional[float] = None,
84
85
  top_k: Optional[int] = None,
@@ -120,6 +121,7 @@ def gen(
120
121
  n,
121
122
  stop,
122
123
  stop_token_ids,
124
+ stop_regex,
123
125
  temperature,
124
126
  top_p,
125
127
  top_k,
@@ -143,6 +145,7 @@ def gen_int(
143
145
  n: Optional[int] = None,
144
146
  stop: Optional[Union[str, List[str]]] = None,
145
147
  stop_token_ids: Optional[List[int]] = None,
148
+ stop_regex: Optional[Union[str, List[str]]] = None,
146
149
  temperature: Optional[float] = None,
147
150
  top_p: Optional[float] = None,
148
151
  top_k: Optional[int] = None,
@@ -162,6 +165,7 @@ def gen_int(
162
165
  n,
163
166
  stop,
164
167
  stop_token_ids,
168
+ stop_regex,
165
169
  temperature,
166
170
  top_p,
167
171
  top_k,
@@ -184,6 +188,7 @@ def gen_string(
184
188
  n: Optional[int] = None,
185
189
  stop: Optional[Union[str, List[str]]] = None,
186
190
  stop_token_ids: Optional[List[int]] = None,
191
+ stop_regex: Optional[Union[str, List[str]]] = None,
187
192
  temperature: Optional[float] = None,
188
193
  top_p: Optional[float] = None,
189
194
  top_k: Optional[int] = None,
@@ -203,6 +208,7 @@ def gen_string(
203
208
  n,
204
209
  stop,
205
210
  stop_token_ids,
211
+ stop_regex,
206
212
  temperature,
207
213
  top_p,
208
214
  top_k,
@@ -792,6 +792,7 @@ class StreamExecutor:
792
792
  "n",
793
793
  "stop",
794
794
  "stop_token_ids",
795
+ "stop_regex",
795
796
  "temperature",
796
797
  "top_p",
797
798
  "top_k",
sglang/lang/ir.py CHANGED
@@ -21,6 +21,7 @@ class SglSamplingParams:
21
21
  n: int = 1
22
22
  stop: Union[str, List[str]] = ()
23
23
  stop_token_ids: Optional[List[int]] = ()
24
+ stop_regex: Optional[Union[str, List[str]]] = ()
24
25
  temperature: float = 1.0
25
26
  top_p: float = 1.0
26
27
  top_k: int = -1 # -1 means disable
@@ -45,6 +46,7 @@ class SglSamplingParams:
45
46
  self.n,
46
47
  self.stop,
47
48
  self.stop_token_ids,
49
+ self.stop_regex,
48
50
  self.temperature,
49
51
  self.top_p,
50
52
  self.top_k,
@@ -123,6 +125,7 @@ class SglSamplingParams:
123
125
  "n": self.n,
124
126
  "stop": self.stop,
125
127
  "stop_token_ids": self.stop_token_ids,
128
+ "stop_regex": self.stop_regex,
126
129
  "temperature": self.temperature,
127
130
  "top_p": self.top_p,
128
131
  "top_k": self.top_k,
@@ -161,6 +164,7 @@ class SglFunction:
161
164
  n: int = 1,
162
165
  stop: Optional[Union[str, List[str]]] = None,
163
166
  stop_token_ids: Optional[List[int]] = None,
167
+ stop_regex: Optional[Union[str, List[str]]] = None,
164
168
  temperature: float = 1.0,
165
169
  top_p: float = 1.0,
166
170
  top_k: int = -1,
@@ -184,12 +188,15 @@ class SglFunction:
184
188
  stop = []
185
189
  if stop_token_ids is None:
186
190
  stop_token_ids = []
191
+ if stop_regex is None:
192
+ stop_regex = []
187
193
 
188
194
  default_sampling_para = SglSamplingParams(
189
195
  max_new_tokens=max_new_tokens,
190
196
  n=n,
191
197
  stop=stop,
192
198
  stop_token_ids=stop_token_ids,
199
+ stop_regex=stop_regex,
193
200
  temperature=temperature,
194
201
  top_p=top_p,
195
202
  top_k=top_k,
@@ -221,6 +228,7 @@ class SglFunction:
221
228
  n: int = 1,
222
229
  stop: Optional[Union[str, List[str]]] = None,
223
230
  stop_token_ids: Optional[List[int]] = None,
231
+ stop_regex: Optional[Union[str, List[str]]] = None,
224
232
  temperature: float = 1.0,
225
233
  top_p: float = 1.0,
226
234
  top_k: int = -1,
@@ -243,6 +251,8 @@ class SglFunction:
243
251
  stop = []
244
252
  if stop_token_ids is None:
245
253
  stop_token_ids = []
254
+ if stop_regex is None:
255
+ stop_regex = []
246
256
 
247
257
  assert isinstance(batch_kwargs, (list, tuple))
248
258
  if len(batch_kwargs) == 0:
@@ -267,6 +277,7 @@ class SglFunction:
267
277
  n=n,
268
278
  stop=stop,
269
279
  stop_token_ids=stop_token_ids,
280
+ stop_regex=stop_regex,
270
281
  temperature=temperature,
271
282
  top_p=top_p,
272
283
  top_k=top_k,
@@ -451,6 +462,7 @@ class SglGen(SglExpr):
451
462
  n: Optional[int] = None,
452
463
  stop: Optional[Union[str, List[str]]] = None,
453
464
  stop_token_ids: Optional[List[int]] = None,
465
+ stop_regex: Optional[Union[str, List[str]]] = None,
454
466
  temperature: Optional[float] = None,
455
467
  top_p: Optional[float] = None,
456
468
  top_k: Optional[int] = None,
@@ -474,6 +486,7 @@ class SglGen(SglExpr):
474
486
  min_new_tokens=min_new_tokens,
475
487
  n=n,
476
488
  stop=stop,
489
+ stop_regex=stop_regex,
477
490
  stop_token_ids=stop_token_ids,
478
491
  temperature=temperature,
479
492
  top_p=top_p,
sglang/launch_server.py CHANGED
@@ -1,9 +1,9 @@
1
1
  """Launch the inference server."""
2
2
 
3
+ import asyncio
3
4
  import os
4
5
  import sys
5
6
 
6
- from sglang.srt.entrypoints.http_server import launch_server
7
7
  from sglang.srt.server_args import prepare_server_args
8
8
  from sglang.srt.utils import kill_process_tree
9
9
 
@@ -11,6 +11,13 @@ if __name__ == "__main__":
11
11
  server_args = prepare_server_args(sys.argv[1:])
12
12
 
13
13
  try:
14
- launch_server(server_args)
14
+ if server_args.grpc_mode:
15
+ from sglang.srt.entrypoints.grpc_server import serve_grpc
16
+
17
+ asyncio.run(serve_grpc(server_args))
18
+ else:
19
+ from sglang.srt.entrypoints.http_server import launch_server
20
+
21
+ launch_server(server_args)
15
22
  finally:
16
23
  kill_process_tree(os.getpid(), include_parent=False)
sglang/profiler.py CHANGED
@@ -25,6 +25,7 @@ def _run_profile(
25
25
  output_dir: Optional[str] = None,
26
26
  profile_name: Optional[str] = None,
27
27
  profile_by_stage: bool = False,
28
+ merge_profiles: bool = False,
28
29
  ) -> str:
29
30
  if output_dir is None:
30
31
  output_dir = PROFILER_DIR
@@ -60,6 +61,7 @@ def _run_profile(
60
61
  "num_steps": str(num_steps),
61
62
  "activities": activities,
62
63
  "profile_by_stage": profile_by_stage,
64
+ "merge_profiles": merge_profiles,
63
65
  }
64
66
 
65
67
  response = requests.post(url=url + "/start_profile", json=json_data)
@@ -76,10 +78,17 @@ def run_profile(
76
78
  output_dir: Optional[str] = None,
77
79
  profile_name: Optional[str] = None,
78
80
  profile_by_stage: bool = False,
81
+ merge_profiles: bool = False,
79
82
  ):
80
83
  # step based profile will self terminate on num_steps constraints
81
84
  link = _run_profile(
82
- url, num_steps, activities, output_dir, profile_name, profile_by_stage
85
+ url,
86
+ num_steps,
87
+ activities,
88
+ output_dir,
89
+ profile_name,
90
+ profile_by_stage,
91
+ merge_profiles,
83
92
  )
84
93
  return link
85
94
 
@@ -145,6 +154,13 @@ if __name__ == "__main__":
145
154
  default=False,
146
155
  help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)",
147
156
  )
157
+ parser.add_argument(
158
+ "--merge-profiles",
159
+ action=argparse.BooleanOptionalAction,
160
+ type=bool,
161
+ default=False,
162
+ help="Whether to merge profiles from all ranks into a single trace file",
163
+ )
148
164
 
149
165
  args = parser.parse_args()
150
166
  activities = []
@@ -163,4 +179,5 @@ if __name__ == "__main__":
163
179
  args.output_dir,
164
180
  args.profile_name,
165
181
  args.profile_by_stage,
182
+ args.merge_profiles,
166
183
  )