sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/lang/interpreter.py +1 -1
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/device_config.py +3 -1
  6. sglang/srt/configs/dots_vlm.py +139 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/load_config.py +1 -0
  9. sglang/srt/configs/model_config.py +50 -6
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +8 -1
  12. sglang/srt/connector/remote_instance.py +82 -0
  13. sglang/srt/constrained/base_grammar_backend.py +48 -12
  14. sglang/srt/constrained/llguidance_backend.py +0 -1
  15. sglang/srt/constrained/outlines_backend.py +0 -1
  16. sglang/srt/constrained/xgrammar_backend.py +28 -9
  17. sglang/srt/custom_op.py +11 -1
  18. sglang/srt/debug_utils/dump_comparator.py +81 -44
  19. sglang/srt/debug_utils/dump_loader.py +97 -0
  20. sglang/srt/debug_utils/dumper.py +11 -3
  21. sglang/srt/debug_utils/text_comparator.py +73 -11
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +21 -10
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  26. sglang/srt/disaggregation/fake/conn.py +1 -1
  27. sglang/srt/disaggregation/mini_lb.py +6 -445
  28. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  29. sglang/srt/disaggregation/nixl/conn.py +180 -16
  30. sglang/srt/disaggregation/prefill.py +5 -3
  31. sglang/srt/disaggregation/utils.py +5 -50
  32. sglang/srt/distributed/parallel_state.py +67 -43
  33. sglang/srt/entrypoints/engine.py +38 -17
  34. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  35. sglang/srt/entrypoints/grpc_server.py +680 -0
  36. sglang/srt/entrypoints/http_server.py +88 -53
  37. sglang/srt/entrypoints/openai/protocol.py +7 -4
  38. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  39. sglang/srt/entrypoints/openai/serving_chat.py +39 -19
  40. sglang/srt/entrypoints/openai/serving_completions.py +15 -4
  41. sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
  42. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  43. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  44. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  45. sglang/srt/eplb/eplb_manager.py +2 -2
  46. sglang/srt/eplb/expert_distribution.py +26 -13
  47. sglang/srt/eplb/expert_location.py +8 -3
  48. sglang/srt/eplb/expert_location_updater.py +1 -1
  49. sglang/srt/function_call/base_format_detector.py +3 -6
  50. sglang/srt/function_call/ebnf_composer.py +11 -9
  51. sglang/srt/function_call/function_call_parser.py +6 -0
  52. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  55. sglang/srt/grpc/__init__.py +1 -0
  56. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  57. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  58. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  59. sglang/srt/hf_transformers_utils.py +4 -0
  60. sglang/srt/layers/activation.py +142 -9
  61. sglang/srt/layers/attention/aiter_backend.py +93 -68
  62. sglang/srt/layers/attention/ascend_backend.py +11 -4
  63. sglang/srt/layers/attention/fla/chunk.py +242 -0
  64. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  65. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  66. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  67. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  68. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  69. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  70. sglang/srt/layers/attention/fla/index.py +37 -0
  71. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  72. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  73. sglang/srt/layers/attention/fla/op.py +66 -0
  74. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  75. sglang/srt/layers/attention/fla/utils.py +331 -0
  76. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  77. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  78. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  79. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  80. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  81. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  82. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  83. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  84. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  85. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  86. sglang/srt/layers/attention/triton_backend.py +18 -1
  87. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  88. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  89. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  90. sglang/srt/layers/communicator.py +45 -7
  91. sglang/srt/layers/dp_attention.py +30 -1
  92. sglang/srt/layers/layernorm.py +32 -15
  93. sglang/srt/layers/linear.py +34 -3
  94. sglang/srt/layers/logits_processor.py +29 -10
  95. sglang/srt/layers/moe/__init__.py +2 -1
  96. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  97. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  98. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  99. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  100. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  107. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  108. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  113. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  114. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  115. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  116. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  117. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  118. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  119. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  120. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  121. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  122. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  123. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  124. sglang/srt/layers/moe/topk.py +30 -9
  125. sglang/srt/layers/moe/utils.py +12 -7
  126. sglang/srt/layers/quantization/awq.py +19 -7
  127. sglang/srt/layers/quantization/base_config.py +11 -6
  128. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  129. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  130. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  131. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  132. sglang/srt/layers/quantization/fp8.py +76 -47
  133. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  134. sglang/srt/layers/quantization/gptq.py +25 -17
  135. sglang/srt/layers/quantization/modelopt_quant.py +182 -49
  136. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  137. sglang/srt/layers/quantization/mxfp4.py +68 -41
  138. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  139. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  140. sglang/srt/layers/quantization/quark/utils.py +97 -0
  141. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  142. sglang/srt/layers/quantization/unquant.py +135 -47
  143. sglang/srt/layers/quantization/w4afp8.py +30 -17
  144. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  145. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  146. sglang/srt/layers/rocm_linear_utils.py +44 -0
  147. sglang/srt/layers/rotary_embedding.py +0 -18
  148. sglang/srt/layers/sampler.py +162 -18
  149. sglang/srt/lora/backend/base_backend.py +50 -8
  150. sglang/srt/lora/backend/triton_backend.py +90 -2
  151. sglang/srt/lora/layers.py +32 -0
  152. sglang/srt/lora/lora.py +4 -1
  153. sglang/srt/lora/lora_manager.py +35 -112
  154. sglang/srt/lora/mem_pool.py +24 -10
  155. sglang/srt/lora/utils.py +18 -9
  156. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  157. sglang/srt/managers/cache_controller.py +200 -199
  158. sglang/srt/managers/data_parallel_controller.py +105 -35
  159. sglang/srt/managers/detokenizer_manager.py +8 -4
  160. sglang/srt/managers/disagg_service.py +46 -0
  161. sglang/srt/managers/io_struct.py +199 -12
  162. sglang/srt/managers/mm_utils.py +1 -0
  163. sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
  164. sglang/srt/managers/schedule_batch.py +77 -56
  165. sglang/srt/managers/schedule_policy.py +4 -3
  166. sglang/srt/managers/scheduler.py +191 -139
  167. sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
  168. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  169. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  170. sglang/srt/managers/template_manager.py +3 -3
  171. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  172. sglang/srt/managers/tokenizer_manager.py +260 -519
  173. sglang/srt/managers/tp_worker.py +53 -4
  174. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  175. sglang/srt/mem_cache/allocator.py +1 -1
  176. sglang/srt/mem_cache/hicache_storage.py +18 -33
  177. sglang/srt/mem_cache/hiradix_cache.py +108 -48
  178. sglang/srt/mem_cache/memory_pool.py +347 -48
  179. sglang/srt/mem_cache/memory_pool_host.py +121 -57
  180. sglang/srt/mem_cache/radix_cache.py +0 -2
  181. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  182. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  183. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
  184. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  185. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  186. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
  187. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  188. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  189. sglang/srt/metrics/collector.py +502 -77
  190. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  191. sglang/srt/metrics/utils.py +48 -0
  192. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  193. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  194. sglang/srt/model_executor/forward_batch_info.py +75 -19
  195. sglang/srt/model_executor/model_runner.py +357 -30
  196. sglang/srt/model_loader/__init__.py +9 -3
  197. sglang/srt/model_loader/loader.py +128 -4
  198. sglang/srt/model_loader/weight_utils.py +2 -1
  199. sglang/srt/models/apertus.py +686 -0
  200. sglang/srt/models/bailing_moe.py +798 -218
  201. sglang/srt/models/bailing_moe_nextn.py +168 -0
  202. sglang/srt/models/deepseek_v2.py +346 -48
  203. sglang/srt/models/dots_vlm.py +174 -0
  204. sglang/srt/models/dots_vlm_vit.py +337 -0
  205. sglang/srt/models/ernie4.py +1 -1
  206. sglang/srt/models/gemma3n_mm.py +1 -1
  207. sglang/srt/models/glm4_moe.py +11 -2
  208. sglang/srt/models/glm4v.py +4 -2
  209. sglang/srt/models/glm4v_moe.py +3 -0
  210. sglang/srt/models/gpt_oss.py +1 -1
  211. sglang/srt/models/internvl.py +28 -0
  212. sglang/srt/models/llama4.py +9 -0
  213. sglang/srt/models/llama_eagle3.py +13 -0
  214. sglang/srt/models/longcat_flash.py +2 -2
  215. sglang/srt/models/minicpmv.py +165 -3
  216. sglang/srt/models/mllama4.py +25 -0
  217. sglang/srt/models/opt.py +637 -0
  218. sglang/srt/models/qwen2.py +7 -0
  219. sglang/srt/models/qwen2_5_vl.py +27 -3
  220. sglang/srt/models/qwen2_moe.py +60 -13
  221. sglang/srt/models/qwen3.py +8 -2
  222. sglang/srt/models/qwen3_moe.py +40 -9
  223. sglang/srt/models/qwen3_next.py +1042 -0
  224. sglang/srt/models/qwen3_next_mtp.py +112 -0
  225. sglang/srt/models/step3_vl.py +1 -1
  226. sglang/srt/models/torch_native_llama.py +1 -1
  227. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  228. sglang/srt/multimodal/processors/glm4v.py +9 -9
  229. sglang/srt/multimodal/processors/internvl.py +141 -129
  230. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  231. sglang/srt/offloader.py +27 -3
  232. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  233. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  234. sglang/srt/sampling/sampling_batch_info.py +18 -15
  235. sglang/srt/server_args.py +355 -37
  236. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  237. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  238. sglang/srt/speculative/eagle_utils.py +0 -2
  239. sglang/srt/speculative/eagle_worker.py +197 -112
  240. sglang/srt/speculative/spec_info.py +5 -0
  241. sglang/srt/speculative/standalone_worker.py +109 -0
  242. sglang/srt/tracing/trace.py +552 -0
  243. sglang/srt/utils.py +46 -3
  244. sglang/srt/weight_sync/utils.py +1 -1
  245. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  246. sglang/test/few_shot_gsm8k.py +1 -0
  247. sglang/test/runners.py +4 -0
  248. sglang/test/test_cutlass_moe.py +24 -6
  249. sglang/test/test_disaggregation_utils.py +66 -0
  250. sglang/test/test_fp4_moe.py +370 -1
  251. sglang/test/test_utils.py +28 -1
  252. sglang/utils.py +12 -0
  253. sglang/version.py +1 -1
  254. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  255. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
  256. sglang/srt/disaggregation/launch_lb.py +0 -118
  257. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  258. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  259. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  260. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  261. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  262. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  263. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  264. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  265. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,552 @@
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+ """package for sglang requests tracing"""
15
+
16
+ from __future__ import annotations
17
+
18
+ import ctypes
19
+ import logging
20
+ import os
21
+ import random
22
+ import threading
23
+ import time
24
+ import uuid
25
+ from dataclasses import dataclass
26
+ from typing import Any, Dict, List, Optional
27
+
28
+ logger = logging.getLogger(__name__)
29
+ opentelemetry_imported = False
30
+ tracing_enabled = False
31
+
32
+ try:
33
+ from opentelemetry import context, propagate, trace
34
+ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
35
+ from opentelemetry.sdk.resources import SERVICE_NAME, Resource
36
+ from opentelemetry.sdk.trace import TracerProvider, id_generator
37
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
38
+
39
+ opentelemetry_imported = True
40
+ except ImportError:
41
+
42
+ class id_generator:
43
+ class IdGenerator:
44
+ pass
45
+
46
+ logger.info("opentelemetry package is not installed, tracing disabled")
47
+
48
+
49
@dataclass
class SglangTraceThreadInfo:
    """Identity of one traced thread, recorded once by trace_set_thread_info()."""

    # Host identifier as returned by __get_host_id().
    host_id: str
    # Native thread id (threading.get_native_id()) of the registered thread.
    pid: int
    # Free-form label supplied by the caller when registering the thread.
    thread_label: str
    # Parallel ranks are optional: trace_set_thread_info() defaults both to None.
    tp_rank: Optional[int]
    dp_rank: Optional[int]
    # Tracer used to start every span emitted on behalf of this thread.
    tracer: trace.Tracer
57
+
58
+
59
@dataclass
class SglangTraceSliceContext:
    """Bookkeeping for one slice that is currently open on a thread."""

    # Name passed to trace_slice_start(); used as the span name at creation.
    slice_name: str
    # Span backing this slice; assigned by trace_slice_start() after creation.
    span: Optional[trace.span.Span] = None
    # When True, defers slice_name assignment until trace_slice_end()
    anonymous: bool = False
65
+
66
+
67
@dataclass
class SglangTraceThreadContext:
    """Per-request tracing state of a single thread."""

    # Registration record of the thread this context belongs to.
    thread_info: SglangTraceThreadInfo
    # Stack of slices currently open on this thread; the innermost slice is last.
    cur_slice_stack: List[SglangTraceSliceContext]
    # Span covering this thread's work for the request; set by __create_thread_context().
    thread_span: Optional[trace.span.Span] = None
    # Record the most recently completed span as the previous span for the next span to be created.
    last_span_context: Optional[trace.span.SpanContext] = None
74
+
75
+
76
@dataclass
class SglangTraceReqContext:
    """Tracing state kept per request id in the module-level reqs_context table."""

    # Request id, always stored as a string.
    rid: str
    # Start timestamp in nanoseconds; also used as the start time of thread spans
    # created for propagated copies of this request.
    start_time_ns: int
    # Thread contexts for this request, keyed by native thread id.
    threads_context: Dict[int, SglangTraceThreadContext]
    bootstrap_room: Optional[int] = None

    # Indicates whether this instance is a replica from the main process.
    # When True, root_span is None and only root_span_context is preserved.
    is_copy: bool = False
    root_span: Optional[trace.span.Span] = None
    root_span_context: Optional[context.Context] = None
88
+
89
+
90
@dataclass
class SglangTracePropagateContext:
    """Trace context handed from one process to another for the same request.

    Carries the context of the request's root span plus the span context that
    the receiving side should record as its "previous" span.
    """

    root_span_context: context.Context
    prev_span_context: Optional[trace.span.SpanContext]

    def to_dict(self):
        """Serialize to a plain dict that instance_from_dict() can read back.

        Returns:
            {"root_span": <propagation carrier>, "prev_span": <ids or "None">}.
            The literal string "None" marks a missing previous span.
        """
        carrier: dict[str, str] = {}
        # Inject from the stored context explicitly. Attaching it as the
        # current context (and never detaching) would leak it into every
        # later span started on the calling thread.
        propagate.inject(carrier, context=self.root_span_context)

        if self.prev_span_context:
            prev_span = {
                "span_id": self.prev_span_context.span_id,
                "trace_id": self.prev_span_context.trace_id,
            }
        else:
            prev_span = "None"

        return {"root_span": carrier, "prev_span": prev_span}

    @classmethod
    def instance_from_dict(cls, d):
        """Rebuild an instance from the output of to_dict().

        Returns:
            A new instance, or None when either required key is missing.
        """
        if "root_span" not in d or "prev_span" not in d:
            return None

        root_span_context = propagate.extract(d["root_span"])

        prev_span = d["prev_span"]
        if prev_span == "None":
            prev_span_context = None
        else:
            # The span originated in another process, hence is_remote=True.
            prev_span_context = trace.span.SpanContext(
                trace_id=prev_span["trace_id"],
                span_id=prev_span["span_id"],
                is_remote=True,
            )

        return cls(root_span_context, prev_span_context)
129
+
130
+
131
class SglangTraceCustomIdGenerator(id_generator.IdGenerator):
    """
    The default IdGenerator may produce duplicate trace IDs across multiple TP scheduler processes,
    hence a custom IdGenerator is implemented.
    """

    def __init__(self):
        super().__init__()
        # An unseeded Random instance seeds itself from OS entropy when the
        # platform provides it. Seeding from time.time() could hand identical
        # ID streams to processes that start within the same clock tick.
        self.local_random = random.Random()

    def __nonzero_bits(self, width: int) -> int:
        """Draw `width` random bits, retrying on the all-zero value."""
        value = self.local_random.getrandbits(width)
        while value == 0:
            value = self.local_random.getrandbits(width)
        return value

    def generate_trace_id(self) -> int:
        # NOTE(review): only 64 bits are populated, as before. Confirm that
        # every consumer of the propagated integer tolerates wider values
        # before widening this to the full OpenTelemetry trace-id size.
        return self.__nonzero_bits(64)

    def generate_span_id(self) -> int:
        return self.__nonzero_bits(64)
147
+
148
+
149
# global variables
# Registered threads, keyed by native thread id.
threads_info: Dict[int, SglangTraceThreadInfo] = {}
# Live request contexts, keyed by request id (always a string).
reqs_context: Dict[str, SglangTraceReqContext] = {}


def __get_cur_time_ns() -> int:
    """Return the current wall-clock time as integer nanoseconds.

    Prefers time.time_ns(): scaling the float from time.time() by 1e9 cannot
    represent individual nanoseconds at present-day epoch values.
    """
    if hasattr(time, "time_ns"):
        return time.time_ns()
    return int(time.time() * 1e9)
154
+
155
+
156
def __get_host_id() -> str:
    """
    In distributed tracing systems, obtain a unique node identifier
    and inject it into all subsequently generated spans
    to prevent PID conflicts between threads on different nodes.

    Returns:
        The stripped contents of /etc/machine-id when readable and non-empty,
        otherwise the hex form of uuid.getnode(), otherwise "unknown".
    """
    # Try the read directly instead of checking for existence first, and only
    # swallow the errors a missing or unreadable file can actually raise.
    try:
        with open("/etc/machine-id", "r") as f:
            machine_id = f.read().strip()
    except (OSError, ValueError):
        machine_id = ""

    # An empty file is not a usable identifier; fall through to the MAC.
    if machine_id:
        return machine_id

    mac = uuid.getnode()
    if mac != 0:
        return uuid.UUID(int=mac).hex

    return "unknown"
174
+
175
+
176
# Should be called by each tracked process.
def process_tracing_init(otlp_endpoint, server_name):
    """Install the OpenTelemetry tracer provider and enable tracing.

    Leaves tracing disabled when the opentelemetry package could not be
    imported or when setting up the provider/exporter raises.

    Args:
        otlp_endpoint: Endpoint passed to the OTLP gRPC span exporter.
        server_name: Value reported as the service name resource attribute.
    """
    global tracing_enabled
    global __get_cur_time_ns
    if not opentelemetry_imported:
        tracing_enabled = False
        return

    try:
        resource = Resource.create(attributes={SERVICE_NAME: server_name})
        tracer_provider = TracerProvider(
            resource=resource, id_generator=SglangTraceCustomIdGenerator()
        )
        exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
        tracer_provider.add_span_processor(BatchSpanProcessor(exporter))
        trace.set_tracer_provider(tracer_provider)
    except Exception as e:
        # Best effort: a tracing failure must not take the server down.
        logger.error(f"Failed to initialize opentelemetry tracing: {e}")
        logger.warning("Please set a correct OTLP endpoint.")
        tracing_enabled = False
        return

    # Switch to the integer nanosecond clock when the runtime provides it.
    if hasattr(time, "time_ns"):
        __get_cur_time_ns = time.time_ns

    tracing_enabled = True
209
+
210
+
211
# Should be called by each tracked thread.
def trace_set_thread_info(
    thread_label: str, tp_rank: Optional[int] = None, dp_rank: Optional[int] = None
):
    """Register the calling thread in threads_info.

    Does nothing when tracing is disabled or the thread is already registered.
    """
    if not tracing_enabled:
        return

    native_tid = threading.get_native_id()
    if native_tid not in threads_info:
        threads_info[native_tid] = SglangTraceThreadInfo(
            host_id=__get_host_id(),
            pid=native_tid,
            thread_label=thread_label,
            tp_rank=tp_rank,
            dp_rank=dp_rank,
            tracer=trace.get_tracer("sglang server"),
        )
230
+
231
+
232
def __create_thread_context(pid, req_span_context, ts: Optional[int] = None):
    """Build a thread context and start its thread span under req_span_context.

    A thread that was never registered is registered as "unknown" first.
    When ts is falsy the current time is used as the span start.
    """
    if pid not in threads_info:
        trace_set_thread_info("unknown")

    info = threads_info[pid]
    has_tp_rank = info.tp_rank is not None

    # Span name: label, optional TP tag, then the host/pid suffix.
    name_parts = [f"{info.thread_label}"]
    if has_tp_rank:
        name_parts.append(f" [TP {info.tp_rank}] ")
    name_parts.append(f"(host:{info.host_id[:8]} | pid:{pid})")
    span_name = "".join(name_parts)

    start_ns = ts or __get_cur_time_ns()
    new_context = SglangTraceThreadContext(thread_info=info, cur_slice_stack=[])
    thread_span = info.tracer.start_span(
        name=span_name,
        start_time=start_ns,
        context=req_span_context,
    )
    new_context.thread_span = thread_span

    if has_tp_rank:
        thread_span.set_attributes({"tp_rank": info.tp_rank})

    identity_attrs = {
        "host_id": info.host_id,
        "pid": info.pid,
        "thread_label": info.thread_label,
    }
    thread_span.set_attributes(identity_attrs)

    return new_context
265
+
266
+
267
def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]:
    """Export the trace context of request `rid` for hand-off to another process.

    Returns:
        The dict produced by SglangTracePropagateContext.to_dict(), or None
        when tracing is disabled, the request is unknown, or it has no root
        span context.
    """
    if not tracing_enabled:
        return None

    rid = str(rid)
    req_context = reqs_context.get(rid)
    if req_context is None or not req_context.root_span_context:
        return None

    prev_span_context = None
    # The calling thread may never have opened a context for this request.
    # Propagate the root context alone in that case instead of raising
    # KeyError, matching the guards used by the other trace_* functions.
    thread_context = req_context.threads_context.get(threading.get_native_id())
    if thread_context is not None:
        if thread_context.cur_slice_stack:
            # Use the outermost open slice as the previous span.
            cur_slice_info = thread_context.cur_slice_stack[0]
            prev_span_context = cur_slice_info.span.get_span_context()
        elif thread_context.last_span_context:
            prev_span_context = thread_context.last_span_context

    trace_context = SglangTracePropagateContext(
        req_context.root_span_context, prev_span_context
    )
    return trace_context.to_dict()
288
+
289
+
290
def trace_set_proc_propagate_context(rid, trace_context: Optional[Dict[str, Any]]):
    """Adopt a trace context exported by another process for request `rid`.

    Creates a copy-type request context if the request is unknown here, then
    opens a thread context for the calling thread unless it already has one.
    """
    if not tracing_enabled:
        return
    if not trace_context:
        return

    propagated = SglangTracePropagateContext.instance_from_dict(trace_context)
    if propagated is None:
        return

    rid = str(rid)
    # Create a copy of the request context
    if rid not in reqs_context:
        reqs_context[rid] = SglangTraceReqContext(
            rid=rid,
            start_time_ns=__get_cur_time_ns(),
            threads_context={},
            root_span_context=propagated.root_span_context,
            is_copy=True,
        )
    req_context = reqs_context[rid]

    native_tid = threading.get_native_id()
    if native_tid in req_context.threads_context:
        return

    # Create new thread context.
    new_thread_context = __create_thread_context(
        native_tid,
        propagated.root_span_context,
        req_context.start_time_ns,
    )
    req_context.threads_context[native_tid] = new_thread_context
    new_thread_context.last_span_context = propagated.prev_span_context
326
+
327
+
328
def trace_req_start(
    rid: str,
    bootstrap_room: Optional[int] = None,
    ts: Optional[int] = None,
):
    """Open the root span and the calling thread's span for a new request.

    Does nothing when tracing is disabled or the calling thread has not been
    registered with trace_set_thread_info().

    Args:
        rid: Request id; converted to str before use.
        bootstrap_room: Optional value recorded as a root span attribute.
        ts: Start time in nanoseconds; the current time is used when falsy.
    """
    if not tracing_enabled:
        return

    rid = str(rid)

    ts = ts or __get_cur_time_ns()

    pid = threading.get_native_id()
    if pid not in threads_info:
        return

    # create req context and root span
    req_context = SglangTraceReqContext(
        rid=rid,
        start_time_ns=ts,
        threads_context={},
        bootstrap_room=bootstrap_room,
        is_copy=False,
    )
    reqs_context[rid] = req_context

    # Drop the worker_id added by MultiTokenizer
    orig_rid = rid.split("_")[-1]
    tracer = threads_info[pid].tracer
    root_span = tracer.start_span(
        name=f"Req {orig_rid[:8]}",
        start_time=ts,
    )

    root_span.set_attributes(
        {
            "rid": rid,
            # Compare against None explicitly so that room 0 is reported as 0.
            "bootstrap_room": (
                bootstrap_room if bootstrap_room is not None else "None"
            ),
        }
    )

    req_context.root_span = root_span
    req_context.root_span_context = trace.set_span_in_context(root_span)

    # create thread context and thread span
    req_context.threads_context[pid] = __create_thread_context(
        pid,
        req_context.root_span_context,
        ts,
    )
377
+
378
+
379
def trace_req_finish(
    rid: str, ts: Optional[int] = None, attrs: Optional[Dict[str, Any]] = None
):
    """Close every thread span and the root span of a request, then forget it.

    Args:
        rid: Request id; converted to str before use.
        ts: End time in nanoseconds; the current time is used when falsy.
        attrs: Optional attributes set on the root span before it ends.
    """
    if not tracing_enabled:
        return

    rid = str(rid)
    if rid not in reqs_context:
        return

    req_context = reqs_context[rid]
    ts = ts or __get_cur_time_ns()

    # End all unclosed thread spans.
    for thread_context in req_context.threads_context.values():
        thread_context.thread_span.end(end_time=ts)

    # Copy-type contexts carry only root_span_context and have no root span
    # object of their own, so there is nothing to annotate or end for them.
    root_span = req_context.root_span
    if root_span is not None:
        if attrs:
            root_span.set_attributes(attrs)
        root_span.end(end_time=ts)

    del reqs_context[rid]
402
+
403
+
404
def trace_slice_start(
    name: str,
    rid: str,
    ts: Optional[int] = None,
    anonymous: bool = False,
):
    """Open a new slice span for request `rid` on the calling thread.

    The span is parented to the innermost open slice, or to the thread span
    when no slice is open. In the latter case it is also linked to the most
    recently completed span, if one was recorded.
    """
    rid = str(rid)
    if not tracing_enabled or rid not in reqs_context:
        return

    per_thread = reqs_context[rid].threads_context
    native_tid = threading.get_native_id()
    if native_tid not in per_thread:
        return

    thread_context = per_thread[native_tid]
    start_ns = ts or __get_cur_time_ns()

    open_slices = thread_context.cur_slice_stack
    if open_slices:
        # Nested slice: hang it under the innermost open one, no link.
        parent_span = open_slices[-1].span
        link_target = None
    else:
        # Top-level slice: hang it under the thread span and find prev slice.
        parent_span = thread_context.thread_span
        link_target = thread_context.last_span_context or None

    new_span = thread_context.thread_info.tracer.start_span(
        name=name,
        start_time=start_ns,
        context=trace.set_span_in_context(parent_span),
    )
    if link_target:
        new_span.add_link(link_target)

    open_slices.append(
        SglangTraceSliceContext(slice_name=name, span=new_span, anonymous=anonymous)
    )
451
+
452
+
453
def trace_slice_end(
    name: str,
    rid: str,
    ts: Optional[int] = None,
    attrs: Optional[Dict[str, Any]] = None,
    auto_next_anon: bool = False,
    thread_finish_flag: bool = False,
):
    """Close the innermost open slice of request `rid` on the calling thread.

    Args:
        name: Expected slice name. An anonymous slice is renamed to it; a
            named slice that does not match is marked with an ERROR status.
        rid: Request id; converted to str before use.
        ts: End time in nanoseconds; the current time is used when falsy.
        attrs: Optional attributes set on the span before it ends.
        auto_next_anon: When True, immediately open an anonymous slice
            starting at the same timestamp. Ignored if thread_finish_flag
            is set.
        thread_finish_flag: When True, also end the thread span and drop the
            thread context (and the request context, if it is a copy with no
            threads left).
    """
    rid = str(rid)
    if not tracing_enabled or rid not in reqs_context:
        return

    pid = threading.get_native_id()
    if pid not in reqs_context[rid].threads_context:
        return

    thread_context = reqs_context[rid].threads_context[pid]

    if not thread_context.cur_slice_stack:
        logger.warning(
            f"No open slice to end: no matching SLICE_START event for '{name}'."
        )
        return

    ts = ts or __get_cur_time_ns()
    slice_info = thread_context.cur_slice_stack[-1]
    span = slice_info.span

    if slice_info.anonymous:
        # The name was deferred at start time; assign it now.
        span.update_name(name)
    elif slice_info.slice_name != name:
        span.set_status(trace.Status(trace.StatusCode.ERROR))
        logger.warning(f"Slice name mismatch: {name} != {slice_info.slice_name}")

    if attrs:
        span.set_attributes(attrs)

    span.end(end_time=ts)

    thread_context.cur_slice_stack.pop()
    if len(thread_context.cur_slice_stack) == 0:
        thread_context.last_span_context = span.get_span_context()

    # If this is the last slice in the thread,
    # release the thread context and check whether to release the request context.
    if thread_finish_flag:
        thread_context.thread_span.end(end_time=ts)
        del reqs_context[rid].threads_context[pid]
        if reqs_context[rid].is_copy and not reqs_context[rid].threads_context:
            del reqs_context[rid]
        return

    if auto_next_anon:
        trace_slice_start("", rid, ts, True)


# alias
trace_slice = trace_slice_end
511
+
512
+
513
# Add event to the current slice on the same thread with the same rid.
def trace_event(name: str, rid: str, ts: Optional[int] = None):
    """Record a named event on the innermost open slice of request `rid`.

    Does nothing when tracing is disabled, or when the request or the calling
    thread is unknown. Logs a warning when no slice is open.

    Args:
        name: Event name.
        rid: Request id; converted to str before any lookup.
        ts: Event time in nanoseconds; the current time is used when falsy.
    """
    # Normalise first: reqs_context is keyed by str, so looking up a
    # non-string rid before converting it would always miss.
    rid = str(rid)
    if not tracing_enabled or rid not in reqs_context:
        return

    pid = threading.get_native_id()
    if pid not in reqs_context[rid].threads_context:
        return

    thread_context = reqs_context[rid].threads_context[pid]

    if not thread_context.cur_slice_stack:
        logger.warning("No slice is currently being traced.")
        return

    ts = ts or __get_cur_time_ns()

    slice_info = thread_context.cur_slice_stack[-1]
    slice_info.span.add_event(name=name, timestamp=ts)
533
+
534
+
535
# Add attrs to the current slice on the same thread with the same rid.
def trace_slice_add_attr(rid: str, attrs: Dict[str, Any]):
    """Set attributes on the innermost open slice of request `rid`.

    Does nothing when tracing is disabled, or when the request or the calling
    thread is unknown. Logs a warning when no slice is open.

    Args:
        rid: Request id; converted to str before any lookup.
        attrs: Attributes passed to the span's set_attributes().
    """
    # Normalise first: reqs_context is keyed by str, so looking up a
    # non-string rid before converting it would always miss.
    rid = str(rid)
    if not tracing_enabled or rid not in reqs_context:
        return

    pid = threading.get_native_id()
    if pid not in reqs_context[rid].threads_context:
        return

    thread_context = reqs_context[rid].threads_context[pid]

    if not thread_context.cur_slice_stack:
        logger.warning("No slice is currently being traced.")
        return

    slice_info = thread_context.cur_slice_stack[-1]
    slice_info.span.set_attributes(attrs)
sglang/srt/utils.py CHANGED
@@ -15,6 +15,7 @@
15
15
 
16
16
  from __future__ import annotations
17
17
 
18
+ import argparse
18
19
  import asyncio
19
20
  import builtins
20
21
  import ctypes
@@ -230,8 +231,16 @@ except:
230
231
  is_intel_amx_backend_available = False
231
232
 
232
233
 
234
try:
    # Probed once at import time instead of inside cpu_has_amx_support(), so
    # that function only reads a module-level flag (moved here to support
    # torch compile).
    is_amx_tile_supported = torch._C._cpu._is_amx_tile_supported()
except Exception:
    # Any failure of the probe is treated as "AMX tiles not supported".
    # `Exception` rather than a bare `except` so KeyboardInterrupt and
    # SystemExit still propagate during import.
    is_amx_tile_supported = False
240
+
241
+
233
242
  def cpu_has_amx_support():
234
- return torch._C._cpu._is_amx_tile_supported() and is_intel_amx_backend_available
243
+ return is_amx_tile_supported and is_intel_amx_backend_available
235
244
 
236
245
 
237
246
  def use_intel_amx_backend(layer):
@@ -426,7 +435,9 @@ def get_available_gpu_memory(
426
435
 
427
436
  elif device == "cpu":
428
437
  # TODO: rename the variables in the current function to be not GPU specific
429
- free_gpu_memory = psutil.virtual_memory().available
438
+ total_free_memory = psutil.virtual_memory().available
439
+ n_numa_node: int = len(get_cpu_ids_by_node())
440
+ free_gpu_memory = round(total_free_memory / n_numa_node, 3)
430
441
  elif device == "npu":
431
442
  num_gpus = torch.npu.device_count()
432
443
  assert gpu_id < num_gpus
@@ -1149,7 +1160,7 @@ def pytorch_profile(name, func, *args, data_size=-1):
1149
1160
 
1150
1161
  def get_zmq_socket(
1151
1162
  context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
1152
- ):
1163
+ ) -> zmq.Socket:
1153
1164
  mem = psutil.virtual_memory()
1154
1165
  total_mem = mem.total / 1024**3
1155
1166
  available_mem = mem.available / 1024**3
@@ -1421,6 +1432,7 @@ def init_custom_process_group(
1421
1432
  store=None,
1422
1433
  group_name=None,
1423
1434
  pg_options=None,
1435
+ device_id=None,
1424
1436
  ):
1425
1437
  from torch.distributed.distributed_c10d import (
1426
1438
  Backend,
@@ -1474,6 +1486,7 @@ def init_custom_process_group(
1474
1486
  group_name=group_name,
1475
1487
  **{pg_options_param_name: pg_options},
1476
1488
  timeout=timeout,
1489
+ device_id=device_id,
1477
1490
  )
1478
1491
 
1479
1492
  _world.pg_group_ranks[pg] = {i: i for i in range(world_size)}
@@ -2900,6 +2913,18 @@ def mxfp_supported():
2900
2913
  return False
2901
2914
 
2902
2915
 
2916
@lru_cache(maxsize=1)
def is_gfx95_supported():
    """Return whether device 0 reports a ``gfx95`` architecture on a HIP build.

    On builds where ``torch.version.hip`` is falsy the answer is ``False``
    and no device is queried. Otherwise the ``gcnArchName`` of device 0 is
    checked for the substring ``"gfx95"``. The result is cached after the
    first call.
    """
    if not torch.version.hip:
        return False

    gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
    return "gfx95" in gcn_arch
2926
+
2927
+
2903
2928
  # LoRA-related constants and utilities
2904
2929
  SUPPORTED_LORA_TARGET_MODULES = [
2905
2930
  "q_proj",
@@ -3015,3 +3040,21 @@ def check_cuda_result(raw_output):
3015
3040
  raise Exception(f"CUDA error: {err}")
3016
3041
 
3017
3042
  return results
3043
+
3044
+
3045
def numa_bind_to_node(node: int):
    """Bind the calling thread to a NUMA node via ``libnuma.so``.

    Calls ``numa_run_on_node(node)`` followed by ``numa_set_localalloc()``.

    Args:
        node: NUMA node number passed to ``numa_run_on_node``.

    Raises:
        SystemError: If ``numa_available()`` reports that NUMA is unavailable,
            or if ``numa_run_on_node`` reports failure.
        OSError: If ``libnuma.so`` cannot be loaded (raised by ``ctypes.CDLL``).
    """
    # use_errno lets the errno left by a failing libnuma call be read back.
    libnuma = ctypes.CDLL("libnuma.so", use_errno=True)
    if libnuma.numa_available() < 0:
        raise SystemError("numa not available on this system")

    # numa_run_on_node() signals failure through a negative return value;
    # surface it instead of continuing without the requested binding.
    if libnuma.numa_run_on_node(ctypes.c_int(node)) < 0:
        raise SystemError(
            f"numa_run_on_node({node}) failed (errno {ctypes.get_errno()})"
        )
    libnuma.numa_set_localalloc()
3052
+
3053
+
3054
def json_list_type(value):
    """Parse ``value`` as JSON, for use as an argparse ``type=`` callable.

    The decoded object is returned as-is; no check is made that it is a list.

    Args:
        value: JSON text to decode.

    Returns:
        Whatever ``json.loads`` produces for ``value``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not valid JSON. The
            original ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    try:
        return json.loads(value)
    except json.JSONDecodeError as err:
        # Chain explicitly so the decoder's position details stay available.
        raise argparse.ArgumentTypeError(
            f"Invalid JSON list: {value}. Please provide a valid JSON list."
        ) from err
@@ -6,7 +6,7 @@ from torch.distributed.device_mesh import DeviceMesh
6
6
  from torch.distributed.tensor import DTensor
7
7
 
8
8
  from sglang.srt.entrypoints.engine import Engine
9
- from sglang.srt.managers.tokenizer_manager import UpdateWeightsFromTensorReqInput
9
+ from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput
10
10
  from sglang.srt.model_executor.model_runner import LocalSerializedTensor
11
11
  from sglang.srt.utils import MultiprocessingSerializer
12
12