sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (265)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/lang/interpreter.py +1 -1
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/device_config.py +3 -1
  6. sglang/srt/configs/dots_vlm.py +139 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/load_config.py +1 -0
  9. sglang/srt/configs/model_config.py +50 -6
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +8 -1
  12. sglang/srt/connector/remote_instance.py +82 -0
  13. sglang/srt/constrained/base_grammar_backend.py +48 -12
  14. sglang/srt/constrained/llguidance_backend.py +0 -1
  15. sglang/srt/constrained/outlines_backend.py +0 -1
  16. sglang/srt/constrained/xgrammar_backend.py +28 -9
  17. sglang/srt/custom_op.py +11 -1
  18. sglang/srt/debug_utils/dump_comparator.py +81 -44
  19. sglang/srt/debug_utils/dump_loader.py +97 -0
  20. sglang/srt/debug_utils/dumper.py +11 -3
  21. sglang/srt/debug_utils/text_comparator.py +73 -11
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +21 -10
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  26. sglang/srt/disaggregation/fake/conn.py +1 -1
  27. sglang/srt/disaggregation/mini_lb.py +6 -445
  28. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  29. sglang/srt/disaggregation/nixl/conn.py +180 -16
  30. sglang/srt/disaggregation/prefill.py +5 -3
  31. sglang/srt/disaggregation/utils.py +5 -50
  32. sglang/srt/distributed/parallel_state.py +67 -43
  33. sglang/srt/entrypoints/engine.py +38 -17
  34. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  35. sglang/srt/entrypoints/grpc_server.py +680 -0
  36. sglang/srt/entrypoints/http_server.py +88 -53
  37. sglang/srt/entrypoints/openai/protocol.py +7 -4
  38. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  39. sglang/srt/entrypoints/openai/serving_chat.py +39 -19
  40. sglang/srt/entrypoints/openai/serving_completions.py +15 -4
  41. sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
  42. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  43. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  44. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  45. sglang/srt/eplb/eplb_manager.py +2 -2
  46. sglang/srt/eplb/expert_distribution.py +26 -13
  47. sglang/srt/eplb/expert_location.py +8 -3
  48. sglang/srt/eplb/expert_location_updater.py +1 -1
  49. sglang/srt/function_call/base_format_detector.py +3 -6
  50. sglang/srt/function_call/ebnf_composer.py +11 -9
  51. sglang/srt/function_call/function_call_parser.py +6 -0
  52. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  55. sglang/srt/grpc/__init__.py +1 -0
  56. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  57. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  58. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  59. sglang/srt/hf_transformers_utils.py +4 -0
  60. sglang/srt/layers/activation.py +142 -9
  61. sglang/srt/layers/attention/aiter_backend.py +93 -68
  62. sglang/srt/layers/attention/ascend_backend.py +11 -4
  63. sglang/srt/layers/attention/fla/chunk.py +242 -0
  64. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  65. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  66. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  67. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  68. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  69. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  70. sglang/srt/layers/attention/fla/index.py +37 -0
  71. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  72. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  73. sglang/srt/layers/attention/fla/op.py +66 -0
  74. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  75. sglang/srt/layers/attention/fla/utils.py +331 -0
  76. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  77. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  78. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  79. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  80. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  81. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  82. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  83. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  84. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  85. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  86. sglang/srt/layers/attention/triton_backend.py +18 -1
  87. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  88. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  89. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  90. sglang/srt/layers/communicator.py +45 -7
  91. sglang/srt/layers/dp_attention.py +30 -1
  92. sglang/srt/layers/layernorm.py +32 -15
  93. sglang/srt/layers/linear.py +34 -3
  94. sglang/srt/layers/logits_processor.py +29 -10
  95. sglang/srt/layers/moe/__init__.py +2 -1
  96. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  97. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  98. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  99. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  100. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  107. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  108. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  113. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  114. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  115. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  116. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  117. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  118. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  119. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  120. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  121. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  122. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  123. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  124. sglang/srt/layers/moe/topk.py +30 -9
  125. sglang/srt/layers/moe/utils.py +12 -7
  126. sglang/srt/layers/quantization/awq.py +19 -7
  127. sglang/srt/layers/quantization/base_config.py +11 -6
  128. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  129. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  130. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  131. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  132. sglang/srt/layers/quantization/fp8.py +76 -47
  133. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  134. sglang/srt/layers/quantization/gptq.py +25 -17
  135. sglang/srt/layers/quantization/modelopt_quant.py +182 -49
  136. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  137. sglang/srt/layers/quantization/mxfp4.py +68 -41
  138. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  139. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  140. sglang/srt/layers/quantization/quark/utils.py +97 -0
  141. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  142. sglang/srt/layers/quantization/unquant.py +135 -47
  143. sglang/srt/layers/quantization/w4afp8.py +30 -17
  144. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  145. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  146. sglang/srt/layers/rocm_linear_utils.py +44 -0
  147. sglang/srt/layers/rotary_embedding.py +0 -18
  148. sglang/srt/layers/sampler.py +162 -18
  149. sglang/srt/lora/backend/base_backend.py +50 -8
  150. sglang/srt/lora/backend/triton_backend.py +90 -2
  151. sglang/srt/lora/layers.py +32 -0
  152. sglang/srt/lora/lora.py +4 -1
  153. sglang/srt/lora/lora_manager.py +35 -112
  154. sglang/srt/lora/mem_pool.py +24 -10
  155. sglang/srt/lora/utils.py +18 -9
  156. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  157. sglang/srt/managers/cache_controller.py +200 -199
  158. sglang/srt/managers/data_parallel_controller.py +105 -35
  159. sglang/srt/managers/detokenizer_manager.py +8 -4
  160. sglang/srt/managers/disagg_service.py +46 -0
  161. sglang/srt/managers/io_struct.py +199 -12
  162. sglang/srt/managers/mm_utils.py +1 -0
  163. sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
  164. sglang/srt/managers/schedule_batch.py +77 -56
  165. sglang/srt/managers/schedule_policy.py +4 -3
  166. sglang/srt/managers/scheduler.py +191 -139
  167. sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
  168. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  169. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  170. sglang/srt/managers/template_manager.py +3 -3
  171. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  172. sglang/srt/managers/tokenizer_manager.py +260 -519
  173. sglang/srt/managers/tp_worker.py +53 -4
  174. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  175. sglang/srt/mem_cache/allocator.py +1 -1
  176. sglang/srt/mem_cache/hicache_storage.py +18 -33
  177. sglang/srt/mem_cache/hiradix_cache.py +108 -48
  178. sglang/srt/mem_cache/memory_pool.py +347 -48
  179. sglang/srt/mem_cache/memory_pool_host.py +121 -57
  180. sglang/srt/mem_cache/radix_cache.py +0 -2
  181. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  182. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  183. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
  184. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  185. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  186. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
  187. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  188. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  189. sglang/srt/metrics/collector.py +502 -77
  190. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  191. sglang/srt/metrics/utils.py +48 -0
  192. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  193. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  194. sglang/srt/model_executor/forward_batch_info.py +75 -19
  195. sglang/srt/model_executor/model_runner.py +357 -30
  196. sglang/srt/model_loader/__init__.py +9 -3
  197. sglang/srt/model_loader/loader.py +128 -4
  198. sglang/srt/model_loader/weight_utils.py +2 -1
  199. sglang/srt/models/apertus.py +686 -0
  200. sglang/srt/models/bailing_moe.py +798 -218
  201. sglang/srt/models/bailing_moe_nextn.py +168 -0
  202. sglang/srt/models/deepseek_v2.py +346 -48
  203. sglang/srt/models/dots_vlm.py +174 -0
  204. sglang/srt/models/dots_vlm_vit.py +337 -0
  205. sglang/srt/models/ernie4.py +1 -1
  206. sglang/srt/models/gemma3n_mm.py +1 -1
  207. sglang/srt/models/glm4_moe.py +11 -2
  208. sglang/srt/models/glm4v.py +4 -2
  209. sglang/srt/models/glm4v_moe.py +3 -0
  210. sglang/srt/models/gpt_oss.py +1 -1
  211. sglang/srt/models/internvl.py +28 -0
  212. sglang/srt/models/llama4.py +9 -0
  213. sglang/srt/models/llama_eagle3.py +13 -0
  214. sglang/srt/models/longcat_flash.py +2 -2
  215. sglang/srt/models/minicpmv.py +165 -3
  216. sglang/srt/models/mllama4.py +25 -0
  217. sglang/srt/models/opt.py +637 -0
  218. sglang/srt/models/qwen2.py +7 -0
  219. sglang/srt/models/qwen2_5_vl.py +27 -3
  220. sglang/srt/models/qwen2_moe.py +60 -13
  221. sglang/srt/models/qwen3.py +8 -2
  222. sglang/srt/models/qwen3_moe.py +40 -9
  223. sglang/srt/models/qwen3_next.py +1042 -0
  224. sglang/srt/models/qwen3_next_mtp.py +112 -0
  225. sglang/srt/models/step3_vl.py +1 -1
  226. sglang/srt/models/torch_native_llama.py +1 -1
  227. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  228. sglang/srt/multimodal/processors/glm4v.py +9 -9
  229. sglang/srt/multimodal/processors/internvl.py +141 -129
  230. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  231. sglang/srt/offloader.py +27 -3
  232. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  233. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  234. sglang/srt/sampling/sampling_batch_info.py +18 -15
  235. sglang/srt/server_args.py +355 -37
  236. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  237. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  238. sglang/srt/speculative/eagle_utils.py +0 -2
  239. sglang/srt/speculative/eagle_worker.py +197 -112
  240. sglang/srt/speculative/spec_info.py +5 -0
  241. sglang/srt/speculative/standalone_worker.py +109 -0
  242. sglang/srt/tracing/trace.py +552 -0
  243. sglang/srt/utils.py +46 -3
  244. sglang/srt/weight_sync/utils.py +1 -1
  245. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  246. sglang/test/few_shot_gsm8k.py +1 -0
  247. sglang/test/runners.py +4 -0
  248. sglang/test/test_cutlass_moe.py +24 -6
  249. sglang/test/test_disaggregation_utils.py +66 -0
  250. sglang/test/test_fp4_moe.py +370 -1
  251. sglang/test/test_utils.py +28 -1
  252. sglang/utils.py +12 -0
  253. sglang/version.py +1 -1
  254. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  255. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
  256. sglang/srt/disaggregation/launch_lb.py +0 -118
  257. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  258. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  259. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  260. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  261. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  262. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  263. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  264. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  265. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/http_server.py

@@ -27,7 +27,11 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
+
+import setproctitle
+
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -45,11 +49,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -75,6 +75,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -82,6 +83,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -94,15 +96,16 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
     MultiTokenizerManager,
-    deserialize_data,
+    MultiTokenizerRouter,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
@@ -125,7 +128,9 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[
+        TokenizerManager, MultiTokenizerRouter, MultiTokenizerManager
+    ]
     template_manager: TemplateManager
     scheduler_info: Dict
 
@@ -138,21 +143,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state
 
 
-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
@@ -160,11 +150,15 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
 
     # Read configuration from shared memory
-    port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
-    server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
-    scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
-    port_args, server_args = deserialize_data(port_args_data, server_args_data)
-    scheduler_info = scheduler_info_data
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
 
     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -190,18 +184,29 @@ async def init_multi_tokenizer() -> ServerArgs:
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args
 
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    server_args = getattr(fast_api_app, "server_args", None)
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-        setup_middlewares(
-            fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
     fast_api_app.warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
@@ -679,6 +684,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )
 
 
+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
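
Note: both new endpoints return a JSON body of the form {"success": ..., "message": ...}, with HTTP 200 on success and 400 otherwise. A minimal client sketch follows; the payload fields are hypothetical, the real ones are defined by InitWeightsSendGroupForRemoteInstanceReqInput in sglang/srt/managers/io_struct.py:

    import requests

    # Hypothetical payload; consult InitWeightsSendGroupForRemoteInstanceReqInput
    # in sglang/srt/managers/io_struct.py for the actual field names.
    resp = requests.post(
        "http://localhost:30000/init_weights_send_group_for_remote_instance",
        json={"master_address": "10.0.0.1", "master_port": 29500},
    )
    print(resp.status_code, resp.json())  # 200 {"success": true, "message": "..."}
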
@@ -1178,6 +1215,12 @@ def launch_server(
         server_args=server_args,
     )
 
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
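
Note: the tracing hooks come from the new sglang/srt/tracing/trace.py (file 242 above, +552 lines). process_tracing_init presumably wires an OTLP exporter to the endpoint in server_args.oltp_traces_endpoint; the standard opentelemetry-python setup for that looks roughly like the sketch below (the generic pattern, not sglang's actual implementation; the endpoint value is illustrative):

    from opentelemetry import trace
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor

    # Export spans to a local OTLP/gRPC collector.
    provider = TracerProvider()
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint="localhost:4317", insecure=True))
    )
    trace.set_tracer_provider(provider)

    tracer = trace.get_tracer("sglang")
    with tracer.start_as_current_span("tokenize"):
        pass  # spans emitted here are exported to the collector
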
@@ -1187,12 +1230,10 @@ def launch_server(
     )
 
     if server_args.tokenizer_worker_num > 1:
-        port_args_shm, server_args_shm, scheduler_info_shm = (
-            write_data_for_multi_tokenizer(
-                port_args,
-                server_args,
-                scheduler_info,
-            )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
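
Note: the three per-object shared-memory segments (port_args, server_args, scheduler_info) collapse into a single multi_tokenizer_args block, matching the single read_from_shared_memory call in init_multi_tokenizer above. A self-contained sketch of the underlying pattern; the helper names and payloads are illustrative, not sglang's actual implementation:

    import pickle
    from multiprocessing import shared_memory

    def write_args(name, *objs):
        # Serialize several objects into one named shared-memory block.
        payload = pickle.dumps(objs)
        shm = shared_memory.SharedMemory(name=name, create=True, size=len(payload))
        shm.buf[: len(payload)] = payload
        return shm  # keep the handle so the parent can unlink() on shutdown

    def read_args(name):
        # pickle stops at its STOP opcode, so any trailing padding is ignored.
        shm = shared_memory.SharedMemory(name=name)
        objs = pickle.loads(bytes(shm.buf))
        shm.close()
        return objs

    shm = write_args("multi_tokenizer_args_1234", {"port": 30000}, {"tp": 1}, {"ok": True})
    port_args, server_args, scheduler_info = read_args("multi_tokenizer_args_1234")
    shm.unlink()  # mirrors multi_tokenizer_args_shm.unlink() in the finally block
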
@@ -1229,6 +1270,9 @@ def launch_server(
                 "level": "INFO",
                 "propagate": False,
             }
+
+        monkey_patch_uvicorn_multiprocessing()
+
         uvicorn.run(
             "sglang.srt.entrypoints.http_server:app",
             host=server_args.host,
@@ -1239,6 +1283,7 @@ def launch_server(
             workers=server_args.tokenizer_worker_num,
         )
     else:
+        app.is_single_tokenizer_mode = True
         uvicorn.run(
             app,
             host=server_args.host,
@@ -1249,10 +1294,8 @@ def launch_server(
         )
     finally:
         if server_args.tokenizer_worker_num > 1:
-            port_args_shm.unlink()
-            server_args_shm.unlink()
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
        else:
             warmup_thread.join()
 
@@ -1401,13 +1444,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())
 
-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()

sglang/srt/entrypoints/openai/protocol.py

@@ -229,6 +229,9 @@ class CompletionRequest(BaseModel):
     # For request id
     rid: Optional[Union[List[str], str]] = None
 
+    # For customer metric labels
+    customer_labels: Optional[Dict[str, str]] = None
+
     @field_validator("max_tokens")
     @classmethod
     def validate_max_tokens_positive(cls, v):
@@ -447,7 +450,7 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
         "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )
 
     @model_validator(mode="before")
@@ -542,9 +545,9 @@ class ChatCompletionRequest(BaseModel):
     rid: Optional[Union[List[str], str]] = None
 
     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
 
 class ChatMessage(BaseModel):
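
Note: the widened bootstrap types mirror rid, so a batched PD-disaggregation request can carry one bootstrap target per prompt. A hypothetical request body illustrating what the new annotations permit (all field values are made up):

    # One bootstrap entry per request id, parallel to rid.
    body = {
        "model": "my-model",
        "messages": [{"role": "user", "content": "hello"}],
        "rid": ["req-0", "req-1"],
        "bootstrap_host": ["prefill-0.local", "prefill-1.local"],
        "bootstrap_port": [8998, None],  # List[Optional[int]]: None keeps the default
        "bootstrap_room": [1001, 1002],
    }
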

sglang/srt/entrypoints/openai/serving_base.py

@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 logger = logging.getLogger(__name__)
 
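
Note: moving the TokenizerManager import under TYPE_CHECKING, combined with from __future__ import annotations, keeps the name visible to type checkers while never importing it at runtime, which breaks the import cycle between the serving modules and tokenizer_manager. The general pattern, as a standalone sketch (module names are placeholders):

    from __future__ import annotations  # annotations become lazily-evaluated strings

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen only by static type checkers, never executed at runtime,
        # so an import cycle with heavy_module cannot occur.
        from heavy_module import HeavyClass

    def handle(obj: HeavyClass) -> None:
        ...  # at runtime the annotation above is just the string "HeavyClass"
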
@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):
 
     def __init__(self, tokenizer_manager: TokenizerManager):
         self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            else None
+        )
 
     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):
 
         # Convert to internal format
         adapted_request, processed_request = self._convert_to_internal_request(
-            request
+            request, raw_request
         )
 
         # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -77,6 +90,7 @@ class OpenAIServingBase(ABC):
     def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
         pass
@@ -150,3 +164,32 @@ class OpenAIServingBase(ABC):
             code=status_code,
         )
         return json.dumps({"error": error.model_dump()})
+
+    def extract_customer_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        customer_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                json.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            customer_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return customer_labels
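
Note: extract_customer_labels parses a JSON object out of a configurable request header and keeps only allow-listed keys; malformed JSON is logged and dropped. An end-to-end sketch, assuming the server was configured with header name "X-Customer-Labels" via tokenizer_metrics_custom_labels_header and an allow-list of {"team", "env"} via tokenizer_metrics_allowed_customer_labels (the server_args fields referenced above; the header name and values here are assumptions):

    import json
    import requests

    labels = {"team": "search", "env": "prod", "oops": "not-allow-listed"}
    resp = requests.post(
        "http://localhost:30000/v1/completions",
        json={"model": "my-model", "prompt": "hello", "max_tokens": 8},
        headers={"X-Customer-Labels": json.dumps(labels)},
    )
    # Server side, only {"team": "search", "env": "prod"} survive the allow-list
    # filter and are attached to the request as customer metric labels.
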

sglang/srt/entrypoints/openai/serving_chat.py

@@ -1,14 +1,15 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
-from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -33,13 +34,16 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.jinja_template_utils import process_content_for_template_format
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -53,6 +57,7 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -91,6 +96,7 @@ class OpenAIServingChat(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -122,6 +128,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
 
+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +149,7 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )
 
         return adapted_request, request
@@ -172,10 +182,11 @@ class OpenAIServingChat(OpenAIServingBase):
             ]
         else:
             tools = [item.function.model_dump() for item in request.tools]
-
-        tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
-        parser = FunctionCallParser(request.tools, tool_call_parser)
-        tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+        if self.tool_call_parser:
+            parser = FunctionCallParser(request.tools, self.tool_call_parser)
+            tool_call_constraint = parser.get_structure_constraint(
+                request.tool_choice
+            )
 
         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -537,7 +548,11 @@ class OpenAIServingChat(OpenAIServingBase):
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
             # Handle tool calls
-            if request.tool_choice != "none" and request.tools:
+            if (
+                request.tool_choice != "none"
+                and request.tools
+                and self.tool_call_parser
+            ):
                 async for chunk in self._process_tool_call_stream(
                     index,
                     delta,
@@ -727,10 +742,13 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Handle tool calls
         tool_calls = None
-        if request.tool_choice != "none" and request.tools:
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools, tool_call_parser, finish_reason
+                text, request.tools, finish_reason
             )
 
         choice_data = ChatCompletionResponseChoice(
@@ -824,11 +842,10 @@ class OpenAIServingChat(OpenAIServingBase):
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
     ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -838,7 +855,10 @@ class OpenAIServingChat(OpenAIServingBase):
             tool_calls = []
             for call_info in call_info_list:
                 # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                if tool_call_parser == "kimi_k2" and call_info.name is not None:
+                if (
+                    self.tool_call_parser == "kimi_k2"
+                    and call_info.name is not None
+                ):
                     tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
                 else:
                     tool_id = f"call_{uuid.uuid4().hex[:24]}"
@@ -933,7 +953,7 @@ class OpenAIServingChat(OpenAIServingBase):
             if index not in parser_dict:
                 parser_dict[index] = FunctionCallParser(
                     tools=request.tools,
-                    tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
+                    tool_call_parser=self.tool_call_parser,
                 )
             parser = parser_dict[index]
 
@@ -962,7 +982,7 @@ class OpenAIServingChat(OpenAIServingBase):
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
-                if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
+                if self.tool_call_parser == "kimi_k2":
                     # Align with Kimi-K2 format: functions.{name}:{index}
                     tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
                 else:

sglang/srt/entrypoints/openai/serving_completions.py

@@ -1,11 +1,12 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
-from sglang.srt.code_completion_parser import generate_completion_prompt_from_request
 from sglang.srt.entrypoints.openai.protocol import (
     CompletionRequest,
     CompletionResponse,
@@ -21,10 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.code_completion_parser import (
+    generate_completion_prompt_from_request,
+)
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -53,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -83,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}
 
+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -97,6 +107,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )
 
         return adapted_request, request

sglang/srt/entrypoints/openai/serving_embedding.py

@@ -1,9 +1,10 @@
-from typing import Any, Dict, List, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse
 
-from sglang.srt.conversation import generate_embedding_convs
 from sglang.srt.entrypoints.openai.protocol import (
     EmbeddingObject,
     EmbeddingRequest,
@@ -14,8 +15,11 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.conversation import generate_embedding_convs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 
 class OpenAIServingEmbedding(OpenAIServingBase):
@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input

sglang/srt/entrypoints/openai/serving_rerank.py

@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None
 
     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document

sglang/srt/entrypoints/openai/serving_responses.py

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from vLLM's OpenAIServingResponses
 """Handler for /v1/responses requests"""
+from __future__ import annotations
 
 import asyncio
 import copy
@@ -9,7 +10,7 @@ import logging
 import time
 from contextlib import AsyncExitStack
 from http import HTTPStatus
-from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
 
 import jinja2
 import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
 from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import random_uuid
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 

sglang/srt/entrypoints/openai/serving_score.py

@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ScoringRequest,
+        raw_request: Request = None,
     ) -> tuple[ScoringRequest, ScoringRequest]:
         """Convert OpenAI scoring request to internal format"""
         # For scoring, we pass the request directly as the tokenizer_manager