sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff compares the contents of two publicly available package versions, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (245)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +10 -1
  3. sglang/bench_serving.py +251 -26
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/internvl.py +6 -0
  7. sglang/srt/configs/longcat_flash.py +104 -0
  8. sglang/srt/configs/model_config.py +37 -7
  9. sglang/srt/configs/qwen3_next.py +326 -0
  10. sglang/srt/connector/__init__.py +1 -1
  11. sglang/srt/connector/base_connector.py +1 -2
  12. sglang/srt/connector/redis.py +2 -2
  13. sglang/srt/connector/serde/__init__.py +1 -1
  14. sglang/srt/connector/serde/safe_serde.py +4 -3
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/ascend/conn.py +75 -0
  21. sglang/srt/disaggregation/base/conn.py +1 -1
  22. sglang/srt/disaggregation/common/conn.py +15 -12
  23. sglang/srt/disaggregation/decode.py +6 -4
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -420
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +6 -4
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +94 -58
  31. sglang/srt/entrypoints/engine.py +34 -14
  32. sglang/srt/entrypoints/http_server.py +172 -47
  33. sglang/srt/entrypoints/openai/protocol.py +63 -3
  34. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  35. sglang/srt/entrypoints/openai/serving_chat.py +34 -19
  36. sglang/srt/entrypoints/openai/serving_completions.py +10 -4
  37. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  38. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  39. sglang/srt/eplb/eplb_manager.py +28 -4
  40. sglang/srt/eplb/expert_distribution.py +55 -15
  41. sglang/srt/eplb/expert_location.py +8 -3
  42. sglang/srt/eplb/expert_location_updater.py +1 -1
  43. sglang/srt/function_call/ebnf_composer.py +11 -9
  44. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  45. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  46. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  47. sglang/srt/hf_transformers_utils.py +12 -0
  48. sglang/srt/layers/activation.py +44 -9
  49. sglang/srt/layers/attention/aiter_backend.py +93 -68
  50. sglang/srt/layers/attention/ascend_backend.py +250 -112
  51. sglang/srt/layers/attention/fla/chunk.py +242 -0
  52. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  53. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  54. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  55. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  56. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  57. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  58. sglang/srt/layers/attention/fla/index.py +37 -0
  59. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  60. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  61. sglang/srt/layers/attention/fla/op.py +66 -0
  62. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  63. sglang/srt/layers/attention/fla/utils.py +331 -0
  64. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  65. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  66. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  67. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  68. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  69. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  70. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  71. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  72. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  73. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  74. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  75. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  76. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  77. sglang/srt/layers/communicator.py +45 -7
  78. sglang/srt/layers/layernorm.py +54 -12
  79. sglang/srt/layers/logits_processor.py +10 -3
  80. sglang/srt/layers/moe/__init__.py +2 -1
  81. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  82. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  83. sglang/srt/layers/moe/ep_moe/layer.py +110 -49
  84. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  85. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  94. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  95. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  96. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  97. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  98. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  99. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  100. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  101. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  102. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  103. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  104. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  105. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  106. sglang/srt/layers/moe/topk.py +43 -12
  107. sglang/srt/layers/moe/utils.py +6 -5
  108. sglang/srt/layers/quantization/awq.py +19 -7
  109. sglang/srt/layers/quantization/base_config.py +11 -6
  110. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  111. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  112. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  113. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  114. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  115. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  116. sglang/srt/layers/quantization/fp8.py +76 -47
  117. sglang/srt/layers/quantization/fp8_utils.py +43 -29
  118. sglang/srt/layers/quantization/gptq.py +25 -17
  119. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  120. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  121. sglang/srt/layers/quantization/mxfp4.py +77 -45
  122. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  123. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  124. sglang/srt/layers/quantization/quark/utils.py +97 -0
  125. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  126. sglang/srt/layers/quantization/unquant.py +135 -47
  127. sglang/srt/layers/quantization/utils.py +13 -0
  128. sglang/srt/layers/quantization/w4afp8.py +60 -42
  129. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  130. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  131. sglang/srt/layers/rocm_linear_utils.py +44 -0
  132. sglang/srt/layers/rotary_embedding.py +28 -19
  133. sglang/srt/layers/sampler.py +29 -5
  134. sglang/srt/lora/backend/base_backend.py +50 -8
  135. sglang/srt/lora/backend/triton_backend.py +90 -2
  136. sglang/srt/lora/layers.py +32 -0
  137. sglang/srt/lora/lora.py +4 -1
  138. sglang/srt/lora/lora_manager.py +35 -112
  139. sglang/srt/lora/mem_pool.py +24 -10
  140. sglang/srt/lora/utils.py +18 -9
  141. sglang/srt/managers/cache_controller.py +242 -278
  142. sglang/srt/managers/data_parallel_controller.py +30 -15
  143. sglang/srt/managers/detokenizer_manager.py +13 -2
  144. sglang/srt/managers/disagg_service.py +46 -0
  145. sglang/srt/managers/io_struct.py +160 -11
  146. sglang/srt/managers/mm_utils.py +6 -1
  147. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  148. sglang/srt/managers/schedule_batch.py +27 -44
  149. sglang/srt/managers/schedule_policy.py +4 -3
  150. sglang/srt/managers/scheduler.py +90 -115
  151. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  152. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  153. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  154. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  155. sglang/srt/managers/template_manager.py +3 -3
  156. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  157. sglang/srt/managers/tokenizer_manager.py +41 -477
  158. sglang/srt/managers/tp_worker.py +16 -4
  159. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  160. sglang/srt/mem_cache/allocator.py +1 -1
  161. sglang/srt/mem_cache/chunk_cache.py +1 -1
  162. sglang/srt/mem_cache/hicache_storage.py +24 -22
  163. sglang/srt/mem_cache/hiradix_cache.py +184 -101
  164. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  165. sglang/srt/mem_cache/memory_pool.py +324 -41
  166. sglang/srt/mem_cache/memory_pool_host.py +25 -18
  167. sglang/srt/mem_cache/radix_cache.py +5 -6
  168. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  169. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  170. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  171. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  172. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
  173. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  174. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  175. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
  176. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  177. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  178. sglang/srt/metrics/collector.py +484 -63
  179. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  180. sglang/srt/metrics/utils.py +48 -0
  181. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  182. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  183. sglang/srt/model_executor/forward_batch_info.py +72 -18
  184. sglang/srt/model_executor/model_runner.py +189 -31
  185. sglang/srt/model_loader/__init__.py +9 -3
  186. sglang/srt/model_loader/loader.py +33 -28
  187. sglang/srt/model_loader/utils.py +12 -0
  188. sglang/srt/model_loader/weight_utils.py +2 -1
  189. sglang/srt/models/deepseek_v2.py +311 -50
  190. sglang/srt/models/gemma3n_mm.py +1 -1
  191. sglang/srt/models/glm4_moe.py +10 -1
  192. sglang/srt/models/glm4v.py +4 -2
  193. sglang/srt/models/gpt_oss.py +5 -18
  194. sglang/srt/models/internvl.py +28 -0
  195. sglang/srt/models/llama4.py +9 -0
  196. sglang/srt/models/llama_eagle3.py +17 -0
  197. sglang/srt/models/longcat_flash.py +1026 -0
  198. sglang/srt/models/longcat_flash_nextn.py +699 -0
  199. sglang/srt/models/minicpmv.py +165 -3
  200. sglang/srt/models/mllama4.py +25 -0
  201. sglang/srt/models/opt.py +637 -0
  202. sglang/srt/models/qwen2.py +33 -3
  203. sglang/srt/models/qwen2_5_vl.py +90 -42
  204. sglang/srt/models/qwen2_moe.py +79 -14
  205. sglang/srt/models/qwen3.py +8 -2
  206. sglang/srt/models/qwen3_moe.py +39 -8
  207. sglang/srt/models/qwen3_next.py +1039 -0
  208. sglang/srt/models/qwen3_next_mtp.py +109 -0
  209. sglang/srt/models/torch_native_llama.py +1 -1
  210. sglang/srt/models/transformers.py +1 -1
  211. sglang/srt/multimodal/processors/base_processor.py +4 -2
  212. sglang/srt/multimodal/processors/glm4v.py +9 -9
  213. sglang/srt/multimodal/processors/internvl.py +141 -129
  214. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  215. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  216. sglang/srt/sampling/sampling_batch_info.py +18 -15
  217. sglang/srt/server_args.py +297 -79
  218. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  219. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  220. sglang/srt/speculative/eagle_worker.py +216 -120
  221. sglang/srt/speculative/spec_info.py +5 -0
  222. sglang/srt/speculative/standalone_worker.py +109 -0
  223. sglang/srt/utils.py +37 -2
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  226. sglang/test/few_shot_gsm8k.py +1 -0
  227. sglang/test/runners.py +4 -0
  228. sglang/test/test_cutlass_moe.py +24 -6
  229. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  230. sglang/test/test_disaggregation_utils.py +66 -0
  231. sglang/test/test_utils.py +25 -1
  232. sglang/utils.py +5 -0
  233. sglang/version.py +1 -1
  234. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
  235. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
  236. sglang/srt/disaggregation/launch_lb.py +0 -131
  237. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  238. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  239. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  240. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  241. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  242. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  243. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  244. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  245. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
@@ -14,10 +14,12 @@
14
14
  """Utilities for Prometheus Metrics Collection."""
15
15
 
16
16
  import time
17
- from dataclasses import dataclass
17
+ from dataclasses import dataclass, field
18
18
  from enum import Enum
19
19
  from typing import Dict, List, Optional, Union
20
20
 
21
+ from sglang.srt.metrics.utils import generate_buckets
22
+ from sglang.srt.server_args import ServerArgs
21
23
  from sglang.srt.utils import get_bool_env_var
22
24
 
23
25
  SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
@@ -48,6 +50,9 @@ class TimeStats:
48
50
  DECODE = "decode"
49
51
  INVALID = "invalid"
50
52
 
53
+ def get_queueing_time(self) -> float:
54
+ return self.forward_entry_time - self.wait_queue_entry_time
55
+
51
56
  def __str__(self) -> str:
52
57
  # if unified
53
58
  _type = self.get_type()
@@ -132,27 +137,48 @@ class TimeStats:
132
137
 
133
138
  @dataclass
134
139
  class SchedulerStats:
140
+ # Basics
135
141
  num_running_reqs: int = 0
136
142
  num_used_tokens: int = 0
137
143
  token_usage: float = 0.0
144
+ swa_token_usage: float = 0.0
138
145
  gen_throughput: float = 0.0
139
146
  num_queue_reqs: int = 0
140
- cache_hit_rate: float = 0.0
141
147
  num_grammar_queue_reqs: int = 0
142
- spec_accept_length: float = 0.0
148
+ num_running_reqs_offline_batch: int = 0
143
149
  avg_request_queue_latency: float = 0.0
150
+ cache_hit_rate: float = 0.0
151
+
152
+ # Speculative decoding
153
+ spec_accept_length: float = 0.0
154
+
155
+ # PD disaggregation
144
156
  num_prefill_prealloc_queue_reqs: int = 0
145
157
  num_prefill_inflight_queue_reqs: int = 0
146
158
  num_decode_prealloc_queue_reqs: int = 0
147
159
  num_decode_transfer_queue_reqs: int = 0
160
+ kv_transfer_speed_gb_s: float = 0.0
161
+ kv_transfer_latency_ms: float = 0.0
162
+
163
+ # Retract
148
164
  total_retracted_reqs: int = 0
165
+ num_retracted_reqs: int = 0
166
+ num_paused_reqs: int = 0
167
+
168
+ # Utilization
169
+ utilization: float = 0.0
170
+ max_running_requests_under_SLO: Optional[int] = None
171
+
172
+ # Engine startup
173
+ engine_startup_time: float = 0.0
174
+ engine_load_weights_time: float = 0.0
149
175
 
150
176
 
151
177
  class SchedulerMetricsCollector:
152
178
 
153
179
  def __init__(self, labels: Dict[str, str]) -> None:
154
180
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
155
- from prometheus_client import Counter, Gauge
181
+ from prometheus_client import Counter, Gauge, Histogram
156
182
 
157
183
  self.labels = labels
158
184
  self.last_log_time = time.perf_counter()
@@ -163,115 +189,338 @@ class SchedulerMetricsCollector:
163
189
  labelnames=labels.keys(),
164
190
  multiprocess_mode="mostrecent",
165
191
  )
166
-
167
192
  self.num_used_tokens = Gauge(
168
193
  name="sglang:num_used_tokens",
169
194
  documentation="The number of used tokens.",
170
195
  labelnames=labels.keys(),
171
196
  multiprocess_mode="mostrecent",
172
197
  )
173
-
174
198
  self.token_usage = Gauge(
175
199
  name="sglang:token_usage",
176
200
  documentation="The token usage.",
177
201
  labelnames=labels.keys(),
178
202
  multiprocess_mode="mostrecent",
179
203
  )
180
-
204
+ self.swa_token_usage = Gauge(
205
+ name="sglang:swa_token_usage",
206
+ documentation="The token usage for SWA layers.",
207
+ labelnames=labels.keys(),
208
+ multiprocess_mode="mostrecent",
209
+ )
181
210
  self.gen_throughput = Gauge(
182
211
  name="sglang:gen_throughput",
183
212
  documentation="The generation throughput (token/s).",
184
213
  labelnames=labels.keys(),
185
214
  multiprocess_mode="mostrecent",
186
215
  )
187
-
188
216
  self.num_queue_reqs = Gauge(
189
217
  name="sglang:num_queue_reqs",
190
218
  documentation="The number of requests in the waiting queue.",
191
219
  labelnames=labels.keys(),
192
220
  multiprocess_mode="mostrecent",
193
221
  )
194
-
195
222
  self.num_grammar_queue_reqs = Gauge(
196
223
  name="sglang:num_grammar_queue_reqs",
197
224
  documentation="The number of requests in the grammar waiting queue.",
198
225
  labelnames=labels.keys(),
199
226
  multiprocess_mode="mostrecent",
200
227
  )
201
-
202
- self.cache_hit_rate = Gauge(
203
- name="sglang:cache_hit_rate",
204
- documentation="The prefix cache hit rate.",
205
- labelnames=labels.keys(),
206
- multiprocess_mode="mostrecent",
207
- )
208
-
209
- self.spec_accept_length = Gauge(
210
- name="sglang:spec_accept_length",
211
- documentation="The average acceptance length of speculative decoding.",
228
+ self.num_running_reqs_offline_batch = Gauge(
229
+ name="sglang:num_running_reqs_offline_batch",
230
+ documentation="The number of running low-priority offline batch requests(label is 'batch').",
212
231
  labelnames=labels.keys(),
213
232
  multiprocess_mode="mostrecent",
214
233
  )
215
-
216
234
  self.avg_request_queue_latency = Gauge(
217
235
  name="sglang:avg_request_queue_latency",
218
236
  documentation="The average request queue latency for the last batch of requests in seconds.",
219
237
  labelnames=labels.keys(),
220
238
  multiprocess_mode="mostrecent",
221
239
  )
240
+ self.cache_hit_rate = Gauge(
241
+ name="sglang:cache_hit_rate",
242
+ documentation="The prefix cache hit rate.",
243
+ labelnames=labels.keys(),
244
+ multiprocess_mode="mostrecent",
245
+ )
222
246
 
223
- self.total_retracted_reqs = Gauge(
224
- name="sglang:total_retracted_reqs",
225
- documentation="The total number of retracted requests due to kvcache full.",
247
+ # Speculative decoding
248
+ self.spec_accept_length = Gauge(
249
+ name="sglang:spec_accept_length",
250
+ documentation="The average acceptance length of speculative decoding.",
226
251
  labelnames=labels.keys(),
227
252
  multiprocess_mode="mostrecent",
228
253
  )
229
254
 
230
- # Disaggregation queue metrics
255
+ # PD disaggregation
231
256
  self.num_prefill_prealloc_queue_reqs = Gauge(
232
257
  name="sglang:num_prefill_prealloc_queue_reqs",
233
258
  documentation="The number of requests in the prefill prealloc queue.",
234
259
  labelnames=labels.keys(),
235
260
  multiprocess_mode="mostrecent",
236
261
  )
237
-
238
262
  self.num_prefill_inflight_queue_reqs = Gauge(
239
263
  name="sglang:num_prefill_inflight_queue_reqs",
240
264
  documentation="The number of requests in the prefill inflight queue.",
241
265
  labelnames=labels.keys(),
242
266
  multiprocess_mode="mostrecent",
243
267
  )
244
-
245
268
  self.num_decode_prealloc_queue_reqs = Gauge(
246
269
  name="sglang:num_decode_prealloc_queue_reqs",
247
270
  documentation="The number of requests in the decode prealloc queue.",
248
271
  labelnames=labels.keys(),
249
272
  multiprocess_mode="mostrecent",
250
273
  )
251
-
252
274
  self.num_decode_transfer_queue_reqs = Gauge(
253
275
  name="sglang:num_decode_transfer_queue_reqs",
254
276
  documentation="The number of requests in the decode transfer queue.",
255
277
  labelnames=labels.keys(),
256
278
  multiprocess_mode="mostrecent",
257
279
  )
258
-
259
280
  self.num_bootstrap_failed_reqs = Counter(
260
- name="sglang:num_bootstrap_failed_reqs",
281
+ name="sglang:num_bootstrap_failed_reqs_total",
261
282
  documentation="The number of bootstrap failed requests.",
262
283
  labelnames=labels.keys(),
263
284
  )
264
-
265
285
  self.num_transfer_failed_reqs = Counter(
266
- name="sglang:num_transfer_failed_reqs",
286
+ name="sglang:num_transfer_failed_reqs_total",
267
287
  documentation="The number of transfer failed requests.",
268
288
  labelnames=labels.keys(),
269
289
  )
290
+ self.kv_transfer_speed_gb_s = Gauge(
291
+ name="sglang:kv_transfer_speed_gb_s",
292
+ documentation="The transfer speed of the KV cache in GB/s.",
293
+ labelnames=labels.keys(),
294
+ multiprocess_mode="mostrecent",
295
+ )
296
+ self.kv_transfer_latency_ms = Gauge(
297
+ name="sglang:kv_transfer_latency_ms",
298
+ documentation="The transfer latency of the KV cache in ms.",
299
+ labelnames=labels.keys(),
300
+ multiprocess_mode="mostrecent",
301
+ )
302
+
303
+ # Retract
304
+ self.total_retracted_reqs = Gauge(
305
+ name="sglang:total_retracted_reqs",
306
+ documentation="The total number of retracted requests due to kvcache full.",
307
+ labelnames=labels.keys(),
308
+ multiprocess_mode="mostrecent",
309
+ )
310
+ self.num_retracted_reqs = Gauge(
311
+ name="sglang:num_retracted_reqs",
312
+ documentation="The number of retracted requests.",
313
+ labelnames=labels.keys(),
314
+ )
315
+ self.num_paused_reqs = Gauge(
316
+ name="sglang:num_paused_reqs",
317
+ documentation="The number of paused requests by async weight sync.",
318
+ labelnames=labels.keys(),
319
+ )
320
+
321
+ # Utilization
322
+ self.utilization = Gauge(
323
+ name="sglang:utilization",
324
+ documentation="The utilization.",
325
+ labelnames=labels.keys(),
326
+ multiprocess_mode="mostrecent",
327
+ )
328
+ self.max_running_requests_under_SLO = Gauge(
329
+ name="sglang:max_running_requests_under_SLO",
330
+ documentation="The maximum number of running requests under SLO.",
331
+ labelnames=labels.keys(),
332
+ multiprocess_mode="mostrecent",
333
+ )
334
+
335
+ # Engine startup
336
+ self.engine_startup_time = Gauge(
337
+ name="sglang:engine_startup_time",
338
+ documentation="The time taken for the engine to start up.",
339
+ labelnames=labels.keys(),
340
+ multiprocess_mode="mostrecent",
341
+ )
342
+ self.engine_load_weights_time = Gauge(
343
+ name="sglang:engine_load_weights_time",
344
+ documentation="The time taken for the engine to load weights.",
345
+ labelnames=labels.keys(),
346
+ multiprocess_mode="mostrecent",
347
+ )
348
+
349
+ # Additional queueing time histogram
350
+ self.queue_time = Histogram(
351
+ name="sglang:queue_time_s",
352
+ documentation="Histogram of queueing time in seconds.",
353
+ labelnames=labels.keys(),
354
+ buckets=[
355
+ 0.0,
356
+ 0.1,
357
+ 0.2,
358
+ 0.5,
359
+ 1,
360
+ 2,
361
+ 3,
362
+ 4,
363
+ 5,
364
+ 10,
365
+ 15,
366
+ 20,
367
+ 30,
368
+ 40,
369
+ 50,
370
+ 60,
371
+ 70,
372
+ 80,
373
+ 90,
374
+ 100,
375
+ 200,
376
+ 300,
377
+ 400,
378
+ 500,
379
+ 600,
380
+ 700,
381
+ 800,
382
+ 900,
383
+ 1000,
384
+ 1200,
385
+ 1400,
386
+ 1600,
387
+ 1800,
388
+ 2000,
389
+ 2500,
390
+ 3000,
391
+ ],
392
+ )
393
+
394
+ # Grammar metrics
395
+ self.grammar_compilation_time = Histogram(
396
+ name="sglang:grammar_compilation_time_seconds",
397
+ documentation="Histogram of grammar compilation time in seconds.",
398
+ labelnames=labels.keys(),
399
+ buckets=[
400
+ 0.0,
401
+ 0.01,
402
+ 0.02,
403
+ 0.05,
404
+ 0.1,
405
+ 0.2,
406
+ 0.5,
407
+ 1,
408
+ 2,
409
+ 5,
410
+ 10,
411
+ 20,
412
+ 30,
413
+ 60,
414
+ 90,
415
+ 120,
416
+ 240,
417
+ ],
418
+ )
419
+ self.num_grammar_cache_hit = Counter(
420
+ name="sglang:num_grammar_cache_hit_total",
421
+ documentation="Number of grammar cache hits.",
422
+ labelnames=labels.keys(),
423
+ )
424
+ self.num_grammar_aborted = Counter(
425
+ name="sglang:num_grammar_aborted_total",
426
+ documentation="Number of grammar aborted requests.",
427
+ labelnames=labels.keys(),
428
+ )
429
+ self.num_grammar_total = Counter(
430
+ name="sglang:num_grammar_total",
431
+ documentation="Number of the total grammar requests.",
432
+ labelnames=labels.keys(),
433
+ )
434
+ self.grammar_schema_count = Histogram(
435
+ name="sglang:grammar_schema_count",
436
+ documentation="Histogram of grammar schema count.",
437
+ labelnames=labels.keys(),
438
+ buckets=[
439
+ 0,
440
+ 1,
441
+ 2,
442
+ 5,
443
+ 10,
444
+ 20,
445
+ 30,
446
+ 40,
447
+ 60,
448
+ 80,
449
+ 100,
450
+ 120,
451
+ 140,
452
+ 160,
453
+ 180,
454
+ 200,
455
+ 300,
456
+ 400,
457
+ 500,
458
+ 700,
459
+ 1000,
460
+ ],
461
+ )
462
+ self.grammar_ebnf_size = Histogram(
463
+ name="sglang:grammar_ebnf_size",
464
+ documentation="Histogram of grammar EBNF size.",
465
+ labelnames=labels.keys(),
466
+ buckets=[
467
+ 0,
468
+ 50,
469
+ 100,
470
+ 200,
471
+ 300,
472
+ 500,
473
+ 1000,
474
+ 2000,
475
+ 3000,
476
+ 5000,
477
+ 10000,
478
+ 20000,
479
+ 30000,
480
+ 50000,
481
+ 100000,
482
+ ],
483
+ )
484
+
485
+ tree_traversal_time_buckets = [
486
+ 0.0,
487
+ 0.01,
488
+ 0.02,
489
+ 0.05,
490
+ 0.1,
491
+ 0.2,
492
+ 0.5,
493
+ 1,
494
+ 2,
495
+ 5,
496
+ 10,
497
+ 15,
498
+ 30,
499
+ 60,
500
+ 90,
501
+ 120,
502
+ 240,
503
+ ]
504
+ self.grammar_tree_traversal_time_avg = Histogram(
505
+ name="sglang:grammar_tree_traversal_time_avg",
506
+ documentation="Histogram of average grammar tree traversal time in seconds.",
507
+ labelnames=labels.keys(),
508
+ buckets=tree_traversal_time_buckets,
509
+ )
510
+ self.grammar_tree_traversal_time_max = Histogram(
511
+ name="sglang:grammar_tree_traversal_time_max",
512
+ documentation="Histogram of max grammar tree traversal time in seconds.",
513
+ labelnames=labels.keys(),
514
+ buckets=tree_traversal_time_buckets,
515
+ )
270
516
 
271
517
  def _log_gauge(self, gauge, data: Union[int, float]) -> None:
272
518
  # Convenience function for logging to gauge.
273
519
  gauge.labels(**self.labels).set(data)
274
520
 
521
+ def log_histogram(self, histogram, data: Union[int, float]) -> None:
522
+ histogram.labels(**self.labels).observe(data)
523
+
275
524
  def increment_bootstrap_failed_reqs(self) -> None:
276
525
  self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
277
526
 
@@ -282,14 +531,19 @@ class SchedulerMetricsCollector:
282
531
  self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
283
532
  self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
284
533
  self._log_gauge(self.token_usage, stats.token_usage)
534
+ self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
285
535
  self._log_gauge(self.gen_throughput, stats.gen_throughput)
286
536
  self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
287
537
  self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
538
+ self._log_gauge(
539
+ self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
540
+ )
288
541
  self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
542
+
543
+ # Speculative decoding
289
544
  self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
290
- self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
291
545
 
292
- # Disaggregation metrics
546
+ # PD disaggregation
293
547
  self._log_gauge(
294
548
  self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
295
549
  )
@@ -302,14 +556,59 @@ class SchedulerMetricsCollector:
302
556
  self._log_gauge(
303
557
  self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
304
558
  )
559
+ self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
560
+ self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
561
+
562
+ # Retract
563
+ self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
564
+ self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
565
+ self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
566
+
567
+ # Utilization
568
+ self._log_gauge(self.utilization, stats.utilization)
569
+ if stats.max_running_requests_under_SLO is not None:
570
+ self._log_gauge(
571
+ self.max_running_requests_under_SLO,
572
+ stats.max_running_requests_under_SLO,
573
+ )
574
+
575
+ # Engine startup time
576
+ self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
577
+ if stats.engine_load_weights_time is not None:
578
+ self._log_gauge(
579
+ self.engine_load_weights_time, stats.engine_load_weights_time
580
+ )
305
581
 
306
582
  self.last_log_time = time.perf_counter()
307
583
 
584
+ def log_grammar_stats(self, grammar_stats) -> None:
585
+ # Duck-typed GrammarStats to avoid cross-package dependency
586
+ if getattr(grammar_stats, "compilation_time", None) is not None:
587
+ self.log_histogram(
588
+ self.grammar_compilation_time, grammar_stats.compilation_time
589
+ )
590
+ if getattr(grammar_stats, "schema_count", None) is not None:
591
+ self.log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
592
+ if getattr(grammar_stats, "ebnf_size", None) is not None:
593
+ self.log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
594
+ tree_times = getattr(grammar_stats, "tree_traversal_time", None)
595
+ if tree_times:
596
+ max_time = max(tree_times)
597
+ avg_time = sum(tree_times) / len(tree_times)
598
+ self.log_histogram(self.grammar_tree_traversal_time_max, max_time)
599
+ self.log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
600
+ if getattr(grammar_stats, "is_cache_hit", False):
601
+ self.num_grammar_cache_hit.labels(**self.labels).inc(1)
602
+ if getattr(grammar_stats, "is_grammar_aborted", False):
603
+ self.num_grammar_aborted.labels(**self.labels).inc(1)
604
+ self.num_grammar_total.labels(**self.labels).inc(1)
605
+
308
606
 
309
607
  class TokenizerMetricsCollector:
310
608
  def __init__(
311
609
  self,
312
- labels: Dict[str, str],
610
+ server_args: Optional[ServerArgs] = None,
611
+ labels: Dict[str, str] = None,
313
612
  bucket_time_to_first_token: Optional[List[float]] = None,
314
613
  bucket_inter_token_latency: Optional[List[float]] = None,
315
614
  bucket_e2e_request_latency: Optional[List[float]] = None,
@@ -318,7 +617,7 @@ class TokenizerMetricsCollector:
318
617
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
319
618
  from prometheus_client import Counter, Histogram
320
619
 
321
- self.labels = labels
620
+ self.labels = labels or {}
322
621
  self.collect_tokens_histogram = collect_tokens_histogram
323
622
 
324
623
  self.prompt_tokens_total = Counter(
@@ -334,7 +633,7 @@ class TokenizerMetricsCollector:
334
633
  )
335
634
 
336
635
  if collect_tokens_histogram:
337
- bucket_prompt_tokens = [
636
+ default_bucket_prompt_tokens = [
338
637
  100,
339
638
  300,
340
639
  500,
@@ -358,39 +657,30 @@ class TokenizerMetricsCollector:
358
657
  30000,
359
658
  35000,
360
659
  40000,
660
+ 66000,
661
+ 99000,
662
+ 132000,
663
+ 300000,
664
+ 600000,
665
+ 900000,
666
+ 1100000,
361
667
  ]
362
668
  self.prompt_tokens_histogram = Histogram(
363
669
  name="sglang:prompt_tokens_histogram",
364
670
  documentation="Histogram of prompt token length.",
365
671
  labelnames=labels.keys(),
366
- buckets=bucket_prompt_tokens,
672
+ buckets=generate_buckets(
673
+ server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
674
+ ),
367
675
  )
368
- bucket_generation_tokens = [
369
- 100,
370
- 300,
371
- 500,
372
- 1000,
373
- 1200,
374
- 1500,
375
- 1700,
376
- 2000,
377
- 2500,
378
- 3000,
379
- 3500,
380
- 4000,
381
- 4500,
382
- 5000,
383
- 6000,
384
- 7000,
385
- 8000,
386
- 9000,
387
- 10000,
388
- ]
389
676
  self.generation_tokens_histogram = Histogram(
390
677
  name="sglang:generation_tokens_histogram",
391
678
  documentation="Histogram of generation token length.",
392
679
  labelnames=labels.keys(),
393
- buckets=bucket_generation_tokens,
680
+ buckets=generate_buckets(
681
+ server_args.generation_tokens_buckets,
682
+ default_bucket_prompt_tokens,
683
+ ),
394
684
  )
395
685
 
396
686
  self.cached_tokens_total = Counter(
@@ -459,7 +749,10 @@ class TokenizerMetricsCollector:
459
749
  100,
460
750
  200,
461
751
  400,
462
- 800,
752
+ 600,
753
+ 1200,
754
+ 1800,
755
+ 2400,
463
756
  ]
464
757
 
465
758
  if bucket_inter_token_latency is None:
@@ -510,6 +803,14 @@ class TokenizerMetricsCollector:
510
803
  buckets=bucket_e2e_request_latency,
511
804
  )
512
805
 
806
+ # Offline batch specific TTFB histogram
807
+ self.histogram_time_to_first_token_offline_batch = Histogram(
808
+ name="sglang:time_to_first_token_seconds_offline_batch",
809
+ documentation="Histogram of time to first token in seconds for offline batch requests.",
810
+ labelnames=labels.keys(),
811
+ buckets=bucket_time_to_first_token,
812
+ )
813
+
513
814
  def _log_histogram(self, histogram, data: Union[int, float]) -> None:
514
815
  histogram.labels(**self.labels).observe(data)
515
816
 
@@ -533,8 +834,26 @@ class TokenizerMetricsCollector:
533
834
  self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
534
835
  self._log_histogram(self.generation_tokens_histogram, generation_tokens)
535
836
 
536
def observe_time_to_first_token(self, value: float, label: str = ""):
    """Record one time-to-first-token sample.

    Args:
        value: The sample to observe.
        label: ``"batch"`` routes the sample to the offline-batch histogram;
            any other value (including the default ``""``) routes it to the
            regular time-to-first-token histogram.
    """
    if label == "batch":
        target = self.histogram_time_to_first_token_offline_batch
    else:
        target = self.histogram_time_to_first_token
    target.labels(**self.labels).observe(value)
844
+
845
def check_time_to_first_token_straggler(
    self,
    value: float,
    *,
    min_observations: int = 100,
    quantile: float = 0.99,
) -> bool:
    """Tell whether ``value`` is a time-to-first-token straggler.

    The per-bucket counts of the time-to-first-token histogram are summed in
    bucket order until the running count exceeds ``quantile`` of the total;
    ``value`` is a straggler when it is at least that bucket's upper bound.

    Args:
        value: The time-to-first-token sample to classify.
        min_observations: Below this many recorded samples the method
            always answers ``False``.
        quantile: Fraction of the total observations the running count has
            to exceed to pick the threshold bucket.

    Returns:
        ``True`` if ``value`` is greater than or equal to the upper bound of
        the threshold bucket, ``False`` otherwise (including when there are
        too few samples or no bucket crosses the threshold).
    """
    child = self.histogram_time_to_first_token.labels(**self.labels)
    # NOTE(review): `_buckets`, `_upper_bounds` and `_value` are private
    # attributes of the histogram object; this code treats each `_value` as
    # a per-bucket (non-cumulative) count -- confirm against the installed
    # prometheus_client version.
    # Snapshot the counts once so the total and the running sum are computed
    # from the same numbers even if samples are observed in between.
    counts = [bucket._value for bucket in child._buckets]
    total_observations = sum(counts)
    if total_observations < min_observations:
        return False

    threshold = total_observations * quantile
    running = 0
    for count, upper_bound in zip(counts, child._upper_bounds):
        running += count
        if running > threshold:
            return value >= upper_bound
    return False
538
857
 
539
858
  def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
540
859
  adjusted_interval = internval / num_new_tokens
@@ -551,3 +870,105 @@ class TokenizerMetricsCollector:
551
870
 
552
871
  def observe_one_aborted_request(self):
553
872
  self.num_aborted_requests_total.labels(**self.labels).inc(1)
873
+
874
+
875
@dataclass
class StorageMetrics:
    """Batches of storage samples handed to ``StorageMetricsCollector``.

    Each field is an independent list of samples; every list starts empty
    and is created per instance.
    """

    # Samples observed on the "prefetch pages" histogram.
    prefetch_pgs: List[int] = field(default_factory=list)
    # Samples observed on the "backup pages" histogram.
    backup_pgs: List[int] = field(default_factory=list)
    # Samples observed on the prefetch bandwidth histogram (documented as GB/s).
    prefetch_bandwidth: List[float] = field(default_factory=list)
    # Samples observed on the backup bandwidth histogram (documented as GB/s).
    backup_bandwidth: List[float] = field(default_factory=list)
881
+
882
+
883
class StorageMetricsCollector:
    """Prometheus counters and histograms for storage prefetch/backup activity.

    All metrics are declared with the keys of ``labels`` as label names and
    are always updated with the label values in ``labels``.
    """

    def __init__(
        self,
        labels: Dict[str, str],
    ):
        """Create the metric objects.

        Args:
            labels: Label name -> label value mapping applied to every metric.
        """
        # Imported here rather than at module level, matching the other
        # collectors in this file.
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_names = labels.keys()

        def make_histogram(name, documentation, buckets):
            # All four histograms differ only in name, help text and buckets.
            return Histogram(
                name=name,
                documentation=documentation,
                labelnames=label_names,
                buckets=buckets,
            )

        self.prefetched_tokens_total = Counter(
            name="sglang:prefetched_tokens_total",
            documentation="Number of prefetched prompt tokens.",
            labelnames=label_names,
        )
        self.backuped_tokens_total = Counter(
            name="sglang:backuped_tokens_total",
            documentation="Number of backuped tokens.",
            labelnames=label_names,
        )

        # Bucket upper bounds shared by the two page-count histograms.
        bucket_io = [1, 5, 10, 50, 100]
        # Bucket upper bounds shared by the two bandwidth histograms.
        bucket_bandwidth = [0.1, 0.5, 1, 5, 10, 50, 100]

        self.histogram_prefetch_pgs = make_histogram(
            "sglang:prefetch_pgs",
            "Histogram of prefetch pages of batches.",
            bucket_io,
        )
        self.histogram_backup_pgs = make_histogram(
            "sglang:backup_pgs",
            "Histogram of backup pages of batches.",
            bucket_io,
        )
        self.histogram_prefetch_bandwidth = make_histogram(
            "sglang:prefetch_bandwidth",
            "Histogram of prefetch bandwidth in GB/s.",
            bucket_bandwidth,
        )
        self.histogram_backup_bandwidth = make_histogram(
            "sglang:backup_bandwidth",
            "Histogram of backup bandwidth in GB/s.",
            bucket_bandwidth,
        )

    def log_prefetched_tokens(self, prefetched_tokens: int):
        """Add ``prefetched_tokens`` to the prefetched counter; non-positive values are ignored."""
        if not (prefetched_tokens > 0):
            return
        self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)

    def log_backuped_tokens(self, backuped_tokens: int):
        """Add ``backuped_tokens`` to the backuped counter; non-positive values are ignored."""
        if not (backuped_tokens > 0):
            return
        self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)

    def _log_histogram(self, histogram, data: Union[int, float]):
        """Observe ``data`` on ``histogram`` under this collector's labels."""
        labelled = histogram.labels(**self.labels)
        labelled.observe(data)

    def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
        """Observe every sample held by ``storage_metrics``.

        Args:
            storage_metrics: Samples to record; ``None`` is a no-op.

        Raises:
            AssertionError: If ``storage_metrics`` is neither ``None`` nor a
                ``StorageMetrics`` instance.
        """
        if storage_metrics is None:
            return

        assert isinstance(storage_metrics, StorageMetrics)

        # Pair each histogram with the sample list that feeds it.
        series = (
            (self.histogram_prefetch_pgs, storage_metrics.prefetch_pgs),
            (self.histogram_backup_pgs, storage_metrics.backup_pgs),
            (self.histogram_prefetch_bandwidth, storage_metrics.prefetch_bandwidth),
            (self.histogram_backup_bandwidth, storage_metrics.backup_bandwidth),
        )
        for histogram, samples in series:
            for sample in samples:
                self._log_histogram(histogram, sample)