sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +89 -54
- sglang/bench_serving.py +437 -40
- sglang/lang/interpreter.py +1 -1
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +90 -27
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +82 -26
- sglang/srt/entrypoints/openai/serving_completions.py +25 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +28 -7
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +381 -136
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +11 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -8
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +111 -56
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
- sglang/srt/layers/quantization/fp8.py +78 -48
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +45 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +93 -68
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +396 -365
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +18 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +190 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +148 -122
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +77 -480
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +53 -40
- sglang/srt/mem_cache/hiradix_cache.py +196 -104
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +395 -53
- sglang/srt/mem_cache/memory_pool_host.py +27 -19
- sglang/srt/mem_cache/radix_cache.py +6 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +190 -32
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +323 -53
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +7 -19
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +91 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{conversation.py → parser/conversation.py} +38 -5
- sglang/srt/parser/harmony_parser.py +588 -0
- sglang/srt/parser/reasoning_parser.py +309 -0
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +307 -80
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +96 -7
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- sglang/srt/reasoning_parser.py +0 -553
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
CHANGED
@@ -60,6 +60,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -654,7 +655,8 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
     # flashinfer uses this environment variable for various kernels from MoE to quant kernels
-    os.environ["TRTLLM_ENABLE_PDL"] = "1"
+    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        os.environ["TRTLLM_ENABLE_PDL"] = "1"

     # Can also be passed as argument
     os.environ["SGLANG_RUN_ID"] = (
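TRTLLM_ENABLE_PDL is now only forced to "1" when the user has not explicitly opted out. A minimal sketch of the new opt-out behavior, assuming you set the variable before sglang's `_set_envs_and_config` runs (for example at the top of your own launcher script):

```python
import os

# Opt out of TensorRT-LLM PDL before sglang configures its environment.
os.environ["TRTLLM_ENABLE_PDL"] = "0"

# Equivalent of the guard added in 0.5.2: any value other than "0"
# (including unset) still results in PDL being forced on.
if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
    os.environ["TRTLLM_ENABLE_PDL"] = "1"

assert os.environ["TRTLLM_ENABLE_PDL"] == "0"
```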
@@ -672,7 +674,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.
+            "0.3.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -680,7 +682,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.
+            "0.3.9.post2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

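The minimum package versions enforced at startup move to flashinfer_python 0.3.1 and sgl-kernel 0.3.9.post2 (the previous floors are truncated in this diff view). A hedged sketch for checking a local environment against the new floors; it uses the standard-library `importlib.metadata` plus the third-party `packaging` helper rather than sglang's own `assert_pkg_version`:

```python
from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version  # assumption: the "packaging" package is installed

# New minimums required by sglang 0.5.2 (flashinfer only matters for that backend).
FLOORS = {"flashinfer_python": "0.3.1", "sgl-kernel": "0.3.9.post2"}

for pkg, floor in FLOORS.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed (requires >= {floor})")
        continue
    status = "ok" if Version(installed) >= Version(floor) else f"needs >= {floor}"
    print(f"{pkg}: {installed} ({status})")
```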
@@ -702,6 +704,24 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)


+def _init_tokenizer_manager(
+    server_args: ServerArgs, port_args: PortArgs
+) -> TokenizerManager:
+    # Launch tokenizer process
+    tokenizer_manager = TokenizerManager(server_args, port_args)
+
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    return tokenizer_manager, template_manager
+
+
 def _launch_subprocesses(
     server_args: ServerArgs, port_args: Optional[PortArgs] = None
 ) -> Tuple[TokenizerManager, TemplateManager, Dict]:
@@ -815,17 +835,15 @@ def _launch_subprocesses(
     )
     detoken_proc.start()

-    # Launch tokenizer process
-    tokenizer_manager = TokenizerManager(server_args, port_args)
-
-    # Initialize templates
-    template_manager = TemplateManager()
-    template_manager.initialize_templates(
-        tokenizer_manager=tokenizer_manager,
-        model_path=server_args.model_path,
-        chat_template=server_args.chat_template,
-        completion_template=server_args.completion_template,
-    )
+    # Init tokenizer manager first, as the bootstrap server is initialized here
+    if server_args.tokenizer_worker_num > 1:
+        # Launch multi-tokenizer router
+        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
+        template_manager = None
+    else:
+        tokenizer_manager, template_manager = _init_tokenizer_manager(
+            server_args, port_args
+        )

     # Wait for the model to finish loading
     scheduler_infos = []
@@ -848,5 +866,7 @@ def _launch_subprocesses(

     # Assume all schedulers have the same scheduler_info
     scheduler_info = scheduler_infos[0]
+
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
     return tokenizer_manager, template_manager, scheduler_info
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -23,11 +23,14 @@ import json
 import logging
 import multiprocessing as multiprocessing
 import os
+import tempfile
 import threading
 import time
 from http import HTTPStatus
 from typing import Any, AsyncIterator, Callable, Dict, List, Optional

+import setproctitle
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

@@ -44,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -91,11 +90,18 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
-from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
     add_prometheus_middleware,
@@ -130,8 +136,72 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None,  # pipe_finish_writer not needed in worker
+                None,  # launch_callback not needed in worker
+            ),
+        )
+
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
@@ -191,7 +261,15 @@ async def lifespan(fast_api_app: FastAPI):
     warmup_thread = getattr(fast_api_app, "warmup_thread", None)
     if warmup_thread is not None:
         warmup_thread.start()
-    yield
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")


 # Fast API
@@ -480,6 +558,16 @@ async def flush_cache():
     )


+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
 @app.api_route("/start_profile", methods=["GET", "POST"])
 async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""
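The new `/clear_hicache_storage_backend` route accepts GET or POST and signals failure through the HTTP status code. A usage sketch, assuming a locally running server on the default port 30000 and the `requests` package:

```python
import requests

BASE_URL = "http://127.0.0.1:30000"  # assumption: local server on the default port

resp = requests.post(f"{BASE_URL}/clear_hicache_storage_backend", timeout=30)
print(resp.status_code, resp.text.strip())
# 200 -> "Hierarchical cache storage backend cleared."
# 400 -> the tokenizer manager reported ret.success == False
```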
@@ -1068,9 +1156,21 @@ def launch_server(
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-    tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-        server_args=server_args,
-    )
+    if server_args.tokenizer_worker_num > 1:
+        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1079,42 +1179,75 @@ def launch_server(
         )
     )

-    # Add api key authorization
-    if server_args.api_key:
-        add_api_key_middleware(app, server_args.api_key)
-
-    # Add prometheus middleware
-    if server_args.enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-
-    # Send a warmup request - we will create the thread launch it
-    # in the lifespan after all other warmups have fired.
-    warmup_thread = threading.Thread(
-        target=_wait_and_warmup,
-        args=(
+    if server_args.tokenizer_worker_num > 1:
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
             server_args,
-            pipe_finish_writer,
-            launch_callback,
-        ),
-    )
-    app.warmup_thread = warmup_thread
+            scheduler_info,
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
+
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread

     try:
         # Update logging configs
         set_uvicorn_logging_configs()
         app.server_args = server_args
         # Listen for HTTP requests
-        uvicorn.run(
-            app,
-            host=server_args.host,
-            port=server_args.port,
-            log_level=server_args.log_level_http or server_args.log_level,
-            timeout_keep_alive=5,
-            loop="uvloop",
-        )
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+
+            monkey_patch_uvicorn_multiprocessing()
+
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            app.is_single_tokenizer_mode = True
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
     finally:
-        warmup_thread.join()
+        if server_args.tokenizer_worker_num > 1:
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
+        else:
+            warmup_thread.join()


 def _execute_server_warmup(
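When `tokenizer_worker_num` is greater than 1, `launch_server` now runs uvicorn with multiple workers against the app import string, and each worker attaches its own `MultiTokenizerManager` in `lifespan`. A hedged launch sketch; `ServerArgs` and `launch_server` are the entry points shown in this diff, while the model path and worker count are placeholders:

```python
from sglang.srt.entrypoints.http_server import launch_server
from sglang.srt.server_args import ServerArgs

# Placeholder model path; tokenizer_worker_num=2 exercises the new multi-tokenizer
# path (MultiTokenizerRouter + uvicorn workers), while 1 keeps the previous behavior.
server_args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    tokenizer_worker_num=2,
)

if __name__ == "__main__":
    launch_server(server_args)
```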
@@ -1261,13 +1394,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
sglang/srt/entrypoints/openai/protocol.py
CHANGED
@@ -35,6 +35,8 @@ from pydantic import (
 )
 from typing_extensions import Literal

+DEFAULT_MODEL_NAME = "default"
+

 class ModelCard(BaseModel):
     """Model cards."""
@@ -108,6 +110,23 @@ class JsonSchemaResponseFormat(BaseModel):
     strict: Optional[bool] = False


+class ResponseFormat(BaseModel):
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+class StructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
 class FileRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/files/create
     file: bytes # The File object (not file name) to be uploaded
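These response-format models are moved above `CompletionRequest` so that the new `response_format` field below can reference them. A small instantiation sketch, assuming sglang is installed; the aliased `schema_` field is populated through its wire name `schema`, and the tag strings are made up:

```python
from sglang.srt.entrypoints.openai.protocol import (
    StructuralTagResponseFormat,
    StructuresResponseFormat,
)

# Illustrative structural-tag format: text between <answer> ... </answer>
# should follow the given JSON schema whenever the trigger string appears.
fmt = StructuralTagResponseFormat(
    type="structural_tag",
    structures=[
        StructuresResponseFormat(
            begin="<answer>",
            schema={"type": "object", "properties": {"city": {"type": "string"}}},
            end="</answer>",
        )
    ],
    triggers=["<answer>"],
)
print(fmt.model_dump(by_alias=True))
```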
@@ -166,7 +185,7 @@ class BatchResponse(BaseModel):
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     prompt: Union[List[int], List[List[int]], str, List[str]]
     best_of: Optional[int] = None
     echo: bool = False
@@ -200,6 +219,7 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None

     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None
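With `model` now defaulting to `DEFAULT_MODEL_NAME` and `response_format` accepted on `CompletionRequest`, structured output can be requested on the completions endpoint as well as chat. A request sketch against an assumed local server (URL, prompt, and schema are illustrative):

```python
import requests

payload = {
    # "model" may now be omitted entirely; it falls back to "default".
    "prompt": "Return a JSON object describing the city of Paris.",
    "max_tokens": 128,
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "City",
            "schema": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "country": {"type": "string"}},
                "required": ["name", "country"],
            },
        },
    },
}

resp = requests.post("http://127.0.0.1:30000/v1/completions", json=payload, timeout=60)
print(resp.json()["choices"][0]["text"])
```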
@@ -327,7 +347,7 @@ class ToolCall(BaseModel):


 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant", "tool"]
+    role: Literal["system", "assistant", "tool", "function"]
     content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
         default=None
     )
@@ -341,9 +361,9 @@ class ChatCompletionMessageGenericParam(BaseModel):
     def _normalize_role(cls, v):
         if isinstance(v, str):
             v_lower = v.lower()
-            if v_lower not in {"system", "assistant", "tool"}:
+            if v_lower not in {"system", "assistant", "tool", "function"}:
                 raise ValueError(
-                    "'role' must be one of 'system', 'assistant', or 'tool' (case-insensitive)."
+                    "'role' must be one of 'system', 'assistant', 'tool', or 'function' (case-insensitive)."
                 )
             return v_lower
         raise ValueError("'role' must be a string")
@@ -359,23 +379,6 @@ ChatCompletionMessageParam = Union[
 ]


-class ResponseFormat(BaseModel):
-    type: Literal["text", "json_object", "json_schema"]
-    json_schema: Optional[JsonSchemaResponseFormat] = None
-
-
-class StructuresResponseFormat(BaseModel):
-    begin: str
-    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
-    end: str
-
-
-class StructuralTagResponseFormat(BaseModel):
-    type: Literal["structural_tag"]
-    structures: List[StructuresResponseFormat]
-    triggers: List[str]
-
-
 class Function(BaseModel):
     """Function descriptions."""

@@ -409,7 +412,7 @@ class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: List[ChatCompletionMessageParam]
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: bool = False
@@ -457,6 +460,66 @@ class ChatCompletionRequest(BaseModel):
         values["tool_choice"] = "auto"
         return values

+    @model_validator(mode="before")
+    @classmethod
+    def normalize_reasoning_inputs(cls, values: Dict):
+        r = values.get("reasoning")
+        if r is None:
+            return values
+
+        if isinstance(r, dict):
+            effort = r.get("effort") or r.get("reasoning_effort")
+            if effort in {"low", "medium", "high"}:
+                values["reasoning_effort"] = effort
+
+            enabled = (
+                r.get("enabled")
+                if r.get("enabled") is not None
+                else r.get("enable", False)
+            )
+            if isinstance(enabled, str):
+                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
+            if enabled:
+                ctk = values.get("chat_template_kwargs")
+                if not isinstance(ctk, dict):
+                    ctk = {}
+                ctk.setdefault("thinking", True)
+                values["chat_template_kwargs"] = ctk
+
+        return values
+
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_json_schema(cls, values):
+        response_format = values.get("response_format")
+        if not response_format:
+            return values
+
+        if response_format.get("type") != "json_schema":
+            return values
+
+        schema = response_format.pop("schema", None)
+        json_schema = response_format.get("json_schema")
+
+        if json_schema:
+            return values
+
+        if schema:
+            name_ = schema.get("title", "Schema")
+            strict_ = False
+            if "properties" in schema and "strict" in schema["properties"]:
+                item = schema["properties"].pop("strict", None)
+                if item and item.get("default", False):
+                    strict_ = True
+
+            response_format["json_schema"] = {
+                "name": name_,
+                "schema": schema,
+                "strict": strict_,
+            }
+
+        return values
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
     min_p: float = 0.0
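The new `normalize_reasoning_inputs` validator maps an OpenAI-style `reasoning` object onto sglang's existing knobs: `effort` (or `reasoning_effort`) becomes the request's `reasoning_effort`, and a truthy `enabled`/`enable` flag sets `chat_template_kwargs["thinking"] = True`. A request sketch, assuming a reasoning-capable model behind a local server on the default port:

```python
import requests

payload = {
    "messages": [{"role": "user", "content": "Prove that sqrt(2) is irrational."}],
    # Server-side, the before-mode validator rewrites this into
    # reasoning_effort="high" and chat_template_kwargs={"thinking": True}.
    "reasoning": {"effort": "high", "enabled": True},
}

resp = requests.post(
    "http://127.0.0.1:30000/v1/chat/completions", json=payload, timeout=120
)
print(resp.json()["choices"][0]["message"])
```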
@@ -479,9 +542,9 @@ class ChatCompletionRequest(BaseModel):
     rid: Optional[Union[List[str], str]] = None

     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None


 class ChatMessage(BaseModel):
@@ -571,7 +634,7 @@ class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings/create
     input: EmbeddingInput
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     encoding_format: str = "float"
     dimensions: Optional[int] = None
     user: Optional[str] = None
@@ -605,7 +668,7 @@ class ScoringRequest(BaseModel):
     )
     apply_softmax: bool = False
     item_first: bool = False
-    model: str
+    model: str = DEFAULT_MODEL_NAME


 class ScoringResponse(BaseModel):
sglang/srt/entrypoints/openai/serving_base.py
CHANGED
@@ -1,15 +1,19 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)
