sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py +85 -54

@@ -27,10 +27,12 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union

 import setproctitle

+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

@@ -47,11 +49,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -77,6 +75,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -84,6 +83,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -96,8 +96,9 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
     MultiTokenizerManager,
-    deserialize_data,
+    MultiTokenizerRouter,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
@@ -127,7 +128,9 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[
+        TokenizerManager, MultiTokenizerRouter, MultiTokenizerManager
+    ]
     template_manager: TemplateManager
     scheduler_info: Dict

@@ -140,21 +143,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
@@ -162,11 +150,15 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")

     # Read configuration from shared memory
-    port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
-    server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
-    scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
-    port_args, server_args = deserialize_data(port_args_data, server_args_data)
-    scheduler_info = scheduler_info_data
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"

     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -192,18 +184,29 @@
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args


 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    server_args = getattr(fast_api_app, "server_args", None)
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-        setup_middlewares(
-            fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
     fast_api_app.warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
@@ -681,6 +684,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )


+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
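Both new endpoints respond with a JSON body of the form {"success": ..., "message": ...}, returning HTTP 200 on success and 400 otherwise. Below is a minimal client sketch; the payload keys are placeholders, since the actual fields of InitWeightsSendGroupForRemoteInstanceReqInput are defined in sglang/srt/managers/io_struct.py (also changed in this release) and are not shown in this hunk.

# Hedged sketch of a client for the new endpoint; the payload contents are hypothetical.
import requests

def init_weights_send_group(base_url: str, payload: dict) -> str:
    # POST to the endpoint added above; 200 means success, 400 means failure.
    resp = requests.post(
        f"{base_url}/init_weights_send_group_for_remote_instance", json=payload
    )
    body = resp.json()
    if not body["success"]:
        raise RuntimeError(f"init_weights_send_group failed: {body['message']}")
    return body["message"]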
@@ -1168,7 +1203,6 @@ def launch_server(
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     if server_args.tokenizer_worker_num > 1:
-        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
         port_args = PortArgs.init_new(server_args)
         port_args.tokenizer_worker_ipc_name = (
             f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -1177,11 +1211,16 @@
             server_args=server_args, port_args=port_args
         )
     else:
-        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
         tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
         )

+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1191,12 +1230,10 @@
     )

     if server_args.tokenizer_worker_num > 1:
-        port_args_shm, server_args_shm, scheduler_info_shm = (
-            write_data_for_multi_tokenizer(
-                port_args,
-                server_args,
-                scheduler_info,
-            )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
@@ -1233,6 +1270,9 @@
             "level": "INFO",
             "propagate": False,
         }
+
+        monkey_patch_uvicorn_multiprocessing()
+
         uvicorn.run(
             "sglang.srt.entrypoints.http_server:app",
             host=server_args.host,
@@ -1243,6 +1283,7 @@
             workers=server_args.tokenizer_worker_num,
         )
     else:
+        app.is_single_tokenizer_mode = True
         uvicorn.run(
             app,
             host=server_args.host,
@@ -1253,10 +1294,8 @@
         )
     finally:
         if server_args.tokenizer_worker_num > 1:
-            port_args_shm.unlink()
-            server_args_shm.unlink()
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         else:
             warmup_thread.join()

@@ -1405,13 +1444,5 @@
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
sglang/srt/entrypoints/openai/protocol.py +4 -1

@@ -229,6 +229,9 @@ class CompletionRequest(BaseModel):
     # For request id
     rid: Optional[Union[List[str], str]] = None

+    # For customer metric labels
+    customer_labels: Optional[Dict[str, str]] = None
+
     @field_validator("max_tokens")
     @classmethod
     def validate_max_tokens_positive(cls, v):
@@ -447,7 +450,7 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
        "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )

     @model_validator(mode="before")
sglang/srt/entrypoints/openai/serving_base.py +46 -3

@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)

@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):

     def __init__(self, tokenizer_manager: TokenizerManager):
         self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            else None
+        )

     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):

             # Convert to internal format
             adapted_request, processed_request = self._convert_to_internal_request(
-                request
+                request, raw_request
             )

             # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -77,6 +90,7 @@
     def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
         pass
@@ -150,3 +164,32 @@
             code=status_code,
         )
         return json.dumps({"error": error.model_dump()})
+
+    def extract_customer_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        customer_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                json.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            customer_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return customer_labels
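extract_customer_labels reads a JSON object from the request header named by server_args.tokenizer_metrics_custom_labels_header and keeps only the keys listed in tokenizer_metrics_allowed_customer_labels; malformed JSON is logged and ignored, and keys outside the allowed set are dropped. A hedged client-side sketch follows, assuming the server is configured with a header named "x-custom-labels" and that "team" and "env" are in the allowed set (both names are illustrative, not defaults):

# Illustrative only: the header name and label keys depend on server configuration
# (tokenizer_metrics_custom_labels_header / tokenizer_metrics_allowed_customer_labels).
import json
import requests

labels = {"team": "search", "env": "prod"}  # hypothetical allowed label keys
resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    headers={"x-custom-labels": json.dumps(labels)},  # assumed header name
    json={"model": "default", "messages": [{"role": "user", "content": "hello"}]},
)
print(resp.status_code)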
sglang/srt/entrypoints/openai/serving_chat.py +36 -16

@@ -1,9 +1,11 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -33,13 +35,15 @@ from sglang.srt.entrypoints.openai.utils import (
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -53,6 +57,7 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -91,6 +96,7 @@
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -122,6 +128,9 @@
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}

+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +149,7 @@
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )

         return adapted_request, request
@@ -172,10 +182,11 @@
                 ]
             else:
                 tools = [item.function.model_dump() for item in request.tools]
-
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
-            parser = FunctionCallParser(request.tools, tool_call_parser)
-            tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )

         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -537,7 +548,11 @@
                     yield f"data: {chunk.model_dump_json()}\n\n"

                 # Handle tool calls
-                if request.tool_choice != "none" and request.tools:
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -727,10 +742,13 @@

             # Handle tool calls
             tool_calls = None
-            if request.tool_choice != "none" and request.tools:
-                tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+            if (
+                request.tool_choice != "none"
+                and request.tools
+                and self.tool_call_parser
+            ):
                 tool_calls, text, finish_reason = self._process_tool_calls(
-                    text, request.tools, tool_call_parser, finish_reason
+                    text, request.tools, finish_reason
                 )

             choice_data = ChatCompletionResponseChoice(
@@ -824,11 +842,10 @@
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
     ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -838,7 +855,10 @@
             tool_calls = []
             for call_info in call_info_list:
                 # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                if tool_call_parser == "kimi_k2" and call_info.name is not None:
+                if (
+                    self.tool_call_parser == "kimi_k2"
+                    and call_info.name is not None
+                ):
                     tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
                 else:
                     tool_id = f"call_{uuid.uuid4().hex[:24]}"
@@ -933,7 +953,7 @@
         if index not in parser_dict:
             parser_dict[index] = FunctionCallParser(
                 tools=request.tools,
-                tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
+                tool_call_parser=self.tool_call_parser,
             )
         parser = parser_dict[index]

@@ -962,7 +982,7 @@
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
-                if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
+                if self.tool_call_parser == "kimi_k2":
                     # Align with Kimi-K2 format: functions.{name}:{index}
                     tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
                 else:
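With the parser cached on self.tool_call_parser, the tool-call ID format still depends on which parser is configured, as the hunks above show: kimi_k2 aligns the ID with the model's own functions.{name}:{index} form, while every other parser gets a random call_-prefixed ID. A small sketch of just that ID logic (inputs are illustrative):

# Sketch of the two tool_call_id shapes produced above; values are illustrative.
import uuid

def make_tool_call_id(tool_call_parser: str, name: str, tool_index: int) -> str:
    if tool_call_parser == "kimi_k2" and name is not None:
        # Kimi-K2 expects IDs aligned with the model output format.
        return f"functions.{name}:{tool_index}"
    return f"call_{uuid.uuid4().hex[:24]}"

print(make_tool_call_id("kimi_k2", "get_weather", 0))  # functions.get_weather:0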
sglang/srt/entrypoints/openai/serving_completions.py +12 -3

@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -20,13 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.code_completion_parser import (
     generate_completion_prompt_from_request,
 )
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -55,6 +59,7 @@
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -85,6 +90,9 @@
         else:
             prompt_kwargs = {"input_ids": prompt}

+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -99,6 +107,7 @@
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )

         return adapted_request, request
sglang/srt/entrypoints/openai/serving_embedding.py +8 -3

@@ -1,4 +1,6 @@
-from typing import Any, Dict, List, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse
@@ -13,10 +15,12 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_embedding_convs

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+

 class OpenAIServingEmbedding(OpenAIServingBase):
     """Handler for v1/embeddings requests"""
@@ -70,6 +74,7 @@
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input
sglang/srt/entrypoints/openai/serving_rerank.py +3 -1

@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None

     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document
sglang/srt/entrypoints/openai/serving_responses.py +6 -3

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from vLLM's OpenAIServingResponses
 """Handler for /v1/responses requests"""
+from __future__ import annotations

 import asyncio
 import copy
@@ -9,7 +10,7 @@ import logging
 import time
 from contextlib import AsyncExitStack
 from http import HTTPStatus
-from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union

 import jinja2
 import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
 from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import random_uuid

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


sglang/srt/entrypoints/openai/serving_score.py +1 -0

@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ScoringRequest,
+        raw_request: Request = None,
     ) -> tuple[ScoringRequest, ScoringRequest]:
         """Convert OpenAI scoring request to internal format"""
         # For scoring, we pass the request directly as the tokenizer_manager
sglang/srt/eplb/eplb_manager.py +2 -2

@@ -55,7 +55,7 @@ class EPLBManager:
         enable_timing = self._rebalance_layers_per_chunk is None

         if enable_timing:
-            torch.cuda.synchronize()
+            torch.get_device_module().synchronize()
             time_start = time.time()

         dump_record_output = get_global_expert_distribution_recorder().dump_record(
@@ -85,7 +85,7 @@
         msg = f"[EPLBManager] rebalance end"
         if enable_timing:
-            torch.cuda.synchronize()
+            torch.get_device_module().synchronize()
             time_end = time.time()
             msg += f" time={time_end - time_start:.3f}s"
         logger.info(msg)
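Both timing blocks now synchronize through torch.get_device_module() instead of hard-coding torch.cuda, so the same timing path works on non-CUDA accelerators. A minimal sketch of that device-agnostic timing pattern, assuming a PyTorch version that provides torch.get_device_module:

# Minimal sketch, assuming torch.get_device_module() is available (recent PyTorch).
import time
import torch

def timed(fn):
    device_mod = torch.get_device_module()  # e.g. torch.cuda on CUDA builds
    if hasattr(device_mod, "synchronize"):
        device_mod.synchronize()  # drain queued kernels before starting the clock
    start = time.time()
    result = fn()
    if hasattr(device_mod, "synchronize"):
        device_mod.synchronize()  # wait for async work so the elapsed time is real
    return result, time.time() - start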