sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +302 -414
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +13 -8
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  63. sglang/srt/layers/moe/topk.py +13 -4
  64. sglang/srt/layers/quantization/__init__.py +111 -7
  65. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  66. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  71. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  72. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  73. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  86. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/fp8.py +69 -28
  89. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  90. sglang/srt/layers/quantization/gptq.py +416 -0
  91. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  92. sglang/srt/layers/quantization/int8_utils.py +73 -0
  93. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  94. sglang/srt/layers/radix_attention.py +1 -0
  95. sglang/srt/layers/rotary_embedding.py +0 -1
  96. sglang/srt/layers/sampler.py +76 -31
  97. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  98. sglang/srt/lora/lora.py +17 -1
  99. sglang/srt/lora/lora_config.py +5 -0
  100. sglang/srt/lora/lora_manager.py +1 -3
  101. sglang/srt/managers/cache_controller.py +193 -62
  102. sglang/srt/managers/configure_logging.py +2 -1
  103. sglang/srt/managers/data_parallel_controller.py +6 -2
  104. sglang/srt/managers/detokenizer_manager.py +124 -102
  105. sglang/srt/managers/image_processor.py +2 -1
  106. sglang/srt/managers/io_struct.py +144 -6
  107. sglang/srt/managers/schedule_batch.py +237 -197
  108. sglang/srt/managers/schedule_policy.py +29 -29
  109. sglang/srt/managers/scheduler.py +773 -334
  110. sglang/srt/managers/session_controller.py +6 -2
  111. sglang/srt/managers/tokenizer_manager.py +225 -68
  112. sglang/srt/managers/tp_worker.py +15 -4
  113. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  114. sglang/srt/mem_cache/chunk_cache.py +18 -11
  115. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  116. sglang/srt/mem_cache/memory_pool.py +68 -37
  117. sglang/srt/mem_cache/radix_cache.py +58 -47
  118. sglang/srt/metrics/collector.py +102 -36
  119. sglang/srt/model_executor/cuda_graph_runner.py +56 -31
  120. sglang/srt/model_executor/forward_batch_info.py +49 -16
  121. sglang/srt/model_executor/model_runner.py +280 -81
  122. sglang/srt/model_loader/loader.py +3 -3
  123. sglang/srt/model_loader/weight_utils.py +36 -14
  124. sglang/srt/models/baichuan.py +31 -6
  125. sglang/srt/models/chatglm.py +39 -7
  126. sglang/srt/models/commandr.py +29 -5
  127. sglang/srt/models/dbrx.py +31 -5
  128. sglang/srt/models/deepseek.py +43 -6
  129. sglang/srt/models/deepseek_nextn.py +32 -19
  130. sglang/srt/models/deepseek_v2.py +265 -32
  131. sglang/srt/models/exaone.py +19 -9
  132. sglang/srt/models/gemma.py +22 -8
  133. sglang/srt/models/gemma2.py +25 -12
  134. sglang/srt/models/gemma2_reward.py +5 -1
  135. sglang/srt/models/gpt2.py +28 -13
  136. sglang/srt/models/gpt_bigcode.py +27 -5
  137. sglang/srt/models/granite.py +21 -9
  138. sglang/srt/models/grok.py +21 -4
  139. sglang/srt/models/internlm2.py +36 -6
  140. sglang/srt/models/internlm2_reward.py +5 -1
  141. sglang/srt/models/llama.py +26 -9
  142. sglang/srt/models/llama_classification.py +5 -1
  143. sglang/srt/models/llama_eagle.py +17 -4
  144. sglang/srt/models/llama_embedding.py +5 -1
  145. sglang/srt/models/llama_reward.py +7 -2
  146. sglang/srt/models/llava.py +19 -3
  147. sglang/srt/models/llavavid.py +10 -1
  148. sglang/srt/models/minicpm.py +26 -2
  149. sglang/srt/models/minicpm3.py +39 -3
  150. sglang/srt/models/minicpmv.py +45 -14
  151. sglang/srt/models/mixtral.py +20 -9
  152. sglang/srt/models/mixtral_quant.py +50 -8
  153. sglang/srt/models/mllama.py +57 -11
  154. sglang/srt/models/olmo.py +34 -6
  155. sglang/srt/models/olmo2.py +34 -13
  156. sglang/srt/models/olmoe.py +26 -4
  157. sglang/srt/models/phi3_small.py +29 -10
  158. sglang/srt/models/qwen.py +26 -3
  159. sglang/srt/models/qwen2.py +26 -4
  160. sglang/srt/models/qwen2_5_vl.py +46 -8
  161. sglang/srt/models/qwen2_eagle.py +17 -5
  162. sglang/srt/models/qwen2_moe.py +44 -6
  163. sglang/srt/models/qwen2_rm.py +78 -0
  164. sglang/srt/models/qwen2_vl.py +39 -8
  165. sglang/srt/models/stablelm.py +32 -5
  166. sglang/srt/models/torch_native_llama.py +5 -2
  167. sglang/srt/models/xverse.py +21 -9
  168. sglang/srt/models/xverse_moe.py +45 -7
  169. sglang/srt/models/yivl.py +2 -1
  170. sglang/srt/openai_api/adapter.py +109 -24
  171. sglang/srt/openai_api/protocol.py +17 -1
  172. sglang/srt/reasoning_parser.py +154 -0
  173. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  174. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  175. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  176. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  177. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  178. sglang/srt/sampling/sampling_batch_info.py +79 -157
  179. sglang/srt/sampling/sampling_params.py +16 -13
  180. sglang/srt/server_args.py +135 -60
  181. sglang/srt/speculative/build_eagle_tree.py +8 -9
  182. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
  183. sglang/srt/speculative/eagle_utils.py +92 -57
  184. sglang/srt/speculative/eagle_worker.py +238 -111
  185. sglang/srt/speculative/spec_info.py +1 -13
  186. sglang/srt/utils.py +43 -17
  187. sglang/srt/warmup.py +47 -0
  188. sglang/test/few_shot_gsm8k.py +4 -1
  189. sglang/test/runners.py +389 -126
  190. sglang/test/send_one.py +88 -0
  191. sglang/test/test_block_fp8_ep.py +361 -0
  192. sglang/test/test_programs.py +1 -1
  193. sglang/test/test_utils.py +138 -84
  194. sglang/utils.py +50 -60
  195. sglang/version.py +1 -1
  196. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
  197. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
  198. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
  199. sglang/bench_latency.py +0 -1
  200. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  201. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  202. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  203. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  204. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
  205. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -32,13 +32,15 @@ import socket
32
32
  import subprocess
33
33
  import sys
34
34
  import tempfile
35
+ import threading
35
36
  import time
36
37
  import warnings
37
38
  from functools import lru_cache
38
39
  from importlib.metadata import PackageNotFoundError, version
39
40
  from io import BytesIO
41
+ from multiprocessing import Pool
40
42
  from multiprocessing.reduction import ForkingPickler
41
- from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
43
+ from typing import Any, Callable, Dict, List, Optional, Protocol, Set, Tuple, Union
42
44
 
43
45
  import numpy as np
44
46
  import psutil
@@ -311,7 +313,7 @@ def make_layers(
311
313
  """Make a list of layers with the given layer function"""
312
314
  modules = torch.nn.ModuleList(
313
315
  [
314
- maybe_offload_to_cpu(layer_fn(idx=idx, prefix=f"{prefix}.{idx}"))
316
+ maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
315
317
  for idx in range(num_hidden_layers)
316
318
  ]
317
319
  )
@@ -480,6 +482,10 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):
480
482
 
481
483
  def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
482
484
  """Kill the process and all its child processes."""
485
+ # Remove sigchld handler to avoid spammy logs.
486
+ if threading.current_thread() is threading.main_thread():
487
+ signal.signal(signal.SIGCHLD, signal.SIG_DFL)
488
+
483
489
  if parent_pid is None:
484
490
  parent_pid = os.getpid()
485
491
  include_parent = False
@@ -735,13 +741,6 @@ def pytorch_profile(name, func, *args, data_size=-1):
735
741
  return result
736
742
 
737
743
 
738
- def first_rank_print(*args, **kwargs):
739
- if torch.cuda.current_device() == 0:
740
- print(*args, **kwargs)
741
- else:
742
- pass
743
-
744
-
745
744
  def get_zmq_socket(
746
745
  context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
747
746
  ):
@@ -1154,9 +1153,9 @@ def set_gpu_proc_affinity(
1154
1153
 
1155
1154
  if psutil.cpu_count() != psutil.cpu_count(logical=False):
1156
1155
  # HT on
1157
- upper_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
1158
- lower_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
1159
- bind_cpu_ids = list(itertools.chain(upper_cpu_ids, lower_cpu_ids))
1156
+ lower_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
1157
+ upper_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
1158
+ bind_cpu_ids = list(itertools.chain(lower_cpu_ids, upper_cpu_ids))
1160
1159
  else:
1161
1160
  # HT off
1162
1161
  bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
@@ -1171,6 +1170,11 @@ def get_bool_env_var(name: str, default: str = "false") -> bool:
1171
1170
  return value.lower() in ("true", "1")
1172
1171
 
1173
1172
 
1173
+ @lru_cache(maxsize=2)
1174
+ def disable_request_logging() -> bool:
1175
+ return get_bool_env_var("SGLANG_DISABLE_REQUEST_LOGGING")
1176
+
1177
+
1174
1178
  @lru_cache(maxsize=8)
1175
1179
  def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int:
1176
1180
  # Note: cuda_visible_devices is not used, but we keep it as an argument for
@@ -1212,7 +1216,11 @@ def cuda_device_count_stateless() -> int:
1212
1216
  return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))
1213
1217
 
1214
1218
 
1215
- def dataclass_to_string_truncated(data, max_length=2048):
1219
+ def dataclass_to_string_truncated(
1220
+ data, max_length=2048, skip_names: Optional[Set[str]] = None
1221
+ ):
1222
+ if skip_names is None:
1223
+ skip_names = set()
1216
1224
  if isinstance(data, str):
1217
1225
  if len(data) > max_length:
1218
1226
  half_length = max_length // 2
@@ -1231,6 +1239,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
1231
1239
  + ", ".join(
1232
1240
  f"'{k}': {dataclass_to_string_truncated(v, max_length)}"
1233
1241
  for k, v in data.items()
1242
+ if k not in skip_names
1234
1243
  )
1235
1244
  + "}"
1236
1245
  )
@@ -1241,6 +1250,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
1241
1250
  + ", ".join(
1242
1251
  f"{f.name}={dataclass_to_string_truncated(getattr(data, f.name), max_length)}"
1243
1252
  for f in fields
1253
+ if f.name not in skip_names
1244
1254
  )
1245
1255
  + ")"
1246
1256
  )
@@ -1289,7 +1299,7 @@ def debug_timing(func):
1289
1299
  tic.record()
1290
1300
  result = func(*args, **kwargs)
1291
1301
  toc.record()
1292
- torch.cuda.synchronize() # Ensure all CUDA operations are complete
1302
+ toc.synchronize() # Wait for the function to complete without synchronizing all ops on the GPU
1293
1303
  elapsed = tic.elapsed_time(toc)
1294
1304
  indices = kwargs.get("indices", args[1] if len(args) > 1 else None)
1295
1305
  num_tokens = len(indices) if indices is not None else 0
@@ -1319,9 +1329,9 @@ def pyspy_dump_schedulers():
1319
1329
  result = subprocess.run(
1320
1330
  cmd, shell=True, capture_output=True, text=True, check=True
1321
1331
  )
1322
- logger.info(f"Profile for PID {pid}:\n{result.stdout}")
1332
+ logger.error(f"Pyspy dump for PID {pid}:\n{result.stdout}")
1323
1333
  except subprocess.CalledProcessError as e:
1324
- logger.info(f"Failed to profile PID {pid}. Error: {e.stderr}")
1334
+ logger.error(f"Pyspy failed to dump PID {pid}. Error: {e.stderr}")
1325
1335
 
1326
1336
 
1327
1337
  def kill_itself_when_parent_died():
@@ -1383,7 +1393,6 @@ def get_ip() -> str:
1383
1393
 
1384
1394
 
1385
1395
  def get_open_port() -> int:
1386
-
1387
1396
  port = os.getenv("SGLANG_PORT")
1388
1397
  if port is not None:
1389
1398
  while True:
@@ -1446,8 +1455,25 @@ def launch_dummy_health_check_server(host, port):
1446
1455
  )
1447
1456
 
1448
1457
 
1458
+ def create_checksum(directory: str):
1459
+ raise NotImplementedError()
1460
+
1461
+
1449
1462
  def set_cuda_arch():
1450
1463
  if is_flashinfer_available():
1451
1464
  capability = torch.cuda.get_device_capability()
1452
1465
  arch = f"{capability[0]}.{capability[1]}"
1453
1466
  os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
1467
+
1468
+
1469
+ def add_prefix(name: str, prefix: str) -> str:
1470
+ """Add a weight path prefix to a module name.
1471
+
1472
+ Args:
1473
+ name: base module name.
1474
+ prefix: weight prefix str to added to the front of `name` concatenated with `.`.
1475
+
1476
+ Returns:
1477
+ The string `prefix.name` if prefix is non-empty, otherwise just `name`.
1478
+ """
1479
+ return name if not prefix else f"{prefix}.{name}"
sglang/srt/warmup.py ADDED
@@ -0,0 +1,47 @@
1
+ import logging
2
+ from typing import List
3
+
4
+ import numpy as np
5
+ import tqdm
6
+
7
+ from sglang.srt.managers.io_struct import GenerateReqInput
8
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
9
+
10
+ logger = logging.getLogger(__file__)
11
+
12
+ _warmup_registry = {}
13
+
14
+
15
+ def warmup(name: str) -> callable:
16
+ def decorator(fn: callable):
17
+ _warmup_registry[name] = fn
18
+ return fn
19
+
20
+ return decorator
21
+
22
+
23
+ async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
24
+ for warmup_name in warmup_names:
25
+ if warmup_name not in _warmup_registry:
26
+ logger.warning(f"Could not find custom warmup {warmup_name}")
27
+ continue
28
+ logger.info(f"Running warmup {warmup_name}")
29
+ await _warmup_registry[warmup_name](tokenizer_manager)
30
+
31
+
32
+ @warmup("voice_chat")
33
+ async def voice_chat(tokenizer_manager: TokenizerManager):
34
+ # this warms up the fused_moe triton kernels and caches them
35
+ # if we don't do this we break real time inference for voice chat
36
+ for i in tqdm.trange(1, 512):
37
+ size = i * 4
38
+ generate_req_input = GenerateReqInput(
39
+ input_ids=(np.random.randint(2**16, size=[size])).tolist(),
40
+ sampling_params={
41
+ "max_new_tokens": 30,
42
+ "temperature": 0.8,
43
+ "stop_token_ids": [1],
44
+ "min_p": 0.0,
45
+ },
46
+ )
47
+ await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
@@ -93,9 +93,11 @@ def run_eval(args):
93
93
  tic = time.time()
94
94
  states = few_shot_gsm8k.run_batch(
95
95
  arguments,
96
- temperature=0,
96
+ temperature=args.temperature if hasattr(args, "temperature") else 0,
97
97
  num_threads=args.parallel,
98
98
  progress_bar=True,
99
+ return_logprob=getattr(args, "return_logprob", None),
100
+ logprob_start_len=getattr(args, "logprob_start_len", None),
99
101
  )
100
102
  latency = time.time() - tic
101
103
 
@@ -141,5 +143,6 @@ if __name__ == "__main__":
141
143
  parser.add_argument("--parallel", type=int, default=128)
142
144
  parser.add_argument("--host", type=str, default="http://127.0.0.1")
143
145
  parser.add_argument("--port", type=int, default=30000)
146
+ parser.add_argument("--temperature", type=float, default=0.0)
144
147
  args = parser.parse_args()
145
148
  run_eval(args)