sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/lang/interpreter.py +1 -1
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/device_config.py +3 -1
  6. sglang/srt/configs/dots_vlm.py +139 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/load_config.py +1 -0
  9. sglang/srt/configs/model_config.py +50 -6
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +8 -1
  12. sglang/srt/connector/remote_instance.py +82 -0
  13. sglang/srt/constrained/base_grammar_backend.py +48 -12
  14. sglang/srt/constrained/llguidance_backend.py +0 -1
  15. sglang/srt/constrained/outlines_backend.py +0 -1
  16. sglang/srt/constrained/xgrammar_backend.py +28 -9
  17. sglang/srt/custom_op.py +11 -1
  18. sglang/srt/debug_utils/dump_comparator.py +81 -44
  19. sglang/srt/debug_utils/dump_loader.py +97 -0
  20. sglang/srt/debug_utils/dumper.py +11 -3
  21. sglang/srt/debug_utils/text_comparator.py +73 -11
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +21 -10
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  26. sglang/srt/disaggregation/fake/conn.py +1 -1
  27. sglang/srt/disaggregation/mini_lb.py +6 -445
  28. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  29. sglang/srt/disaggregation/nixl/conn.py +180 -16
  30. sglang/srt/disaggregation/prefill.py +5 -3
  31. sglang/srt/disaggregation/utils.py +5 -50
  32. sglang/srt/distributed/parallel_state.py +67 -43
  33. sglang/srt/entrypoints/engine.py +38 -17
  34. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  35. sglang/srt/entrypoints/grpc_server.py +680 -0
  36. sglang/srt/entrypoints/http_server.py +88 -53
  37. sglang/srt/entrypoints/openai/protocol.py +7 -4
  38. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  39. sglang/srt/entrypoints/openai/serving_chat.py +39 -19
  40. sglang/srt/entrypoints/openai/serving_completions.py +15 -4
  41. sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
  42. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  43. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  44. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  45. sglang/srt/eplb/eplb_manager.py +2 -2
  46. sglang/srt/eplb/expert_distribution.py +26 -13
  47. sglang/srt/eplb/expert_location.py +8 -3
  48. sglang/srt/eplb/expert_location_updater.py +1 -1
  49. sglang/srt/function_call/base_format_detector.py +3 -6
  50. sglang/srt/function_call/ebnf_composer.py +11 -9
  51. sglang/srt/function_call/function_call_parser.py +6 -0
  52. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  55. sglang/srt/grpc/__init__.py +1 -0
  56. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  57. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  58. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  59. sglang/srt/hf_transformers_utils.py +4 -0
  60. sglang/srt/layers/activation.py +142 -9
  61. sglang/srt/layers/attention/aiter_backend.py +93 -68
  62. sglang/srt/layers/attention/ascend_backend.py +11 -4
  63. sglang/srt/layers/attention/fla/chunk.py +242 -0
  64. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  65. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  66. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  67. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  68. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  69. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  70. sglang/srt/layers/attention/fla/index.py +37 -0
  71. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  72. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  73. sglang/srt/layers/attention/fla/op.py +66 -0
  74. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  75. sglang/srt/layers/attention/fla/utils.py +331 -0
  76. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  77. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  78. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  79. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  80. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  81. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  82. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  83. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  84. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  85. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  86. sglang/srt/layers/attention/triton_backend.py +18 -1
  87. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  88. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  89. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  90. sglang/srt/layers/communicator.py +45 -7
  91. sglang/srt/layers/dp_attention.py +30 -1
  92. sglang/srt/layers/layernorm.py +32 -15
  93. sglang/srt/layers/linear.py +34 -3
  94. sglang/srt/layers/logits_processor.py +29 -10
  95. sglang/srt/layers/moe/__init__.py +2 -1
  96. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  97. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  98. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  99. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  100. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  107. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  108. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  113. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  114. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  115. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  116. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  117. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  118. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  119. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  120. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  121. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  122. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  123. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  124. sglang/srt/layers/moe/topk.py +30 -9
  125. sglang/srt/layers/moe/utils.py +12 -7
  126. sglang/srt/layers/quantization/awq.py +19 -7
  127. sglang/srt/layers/quantization/base_config.py +11 -6
  128. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  129. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  130. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  131. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  132. sglang/srt/layers/quantization/fp8.py +76 -47
  133. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  134. sglang/srt/layers/quantization/gptq.py +25 -17
  135. sglang/srt/layers/quantization/modelopt_quant.py +182 -49
  136. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  137. sglang/srt/layers/quantization/mxfp4.py +68 -41
  138. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  139. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  140. sglang/srt/layers/quantization/quark/utils.py +97 -0
  141. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  142. sglang/srt/layers/quantization/unquant.py +135 -47
  143. sglang/srt/layers/quantization/w4afp8.py +30 -17
  144. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  145. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  146. sglang/srt/layers/rocm_linear_utils.py +44 -0
  147. sglang/srt/layers/rotary_embedding.py +0 -18
  148. sglang/srt/layers/sampler.py +162 -18
  149. sglang/srt/lora/backend/base_backend.py +50 -8
  150. sglang/srt/lora/backend/triton_backend.py +90 -2
  151. sglang/srt/lora/layers.py +32 -0
  152. sglang/srt/lora/lora.py +4 -1
  153. sglang/srt/lora/lora_manager.py +35 -112
  154. sglang/srt/lora/mem_pool.py +24 -10
  155. sglang/srt/lora/utils.py +18 -9
  156. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  157. sglang/srt/managers/cache_controller.py +200 -199
  158. sglang/srt/managers/data_parallel_controller.py +105 -35
  159. sglang/srt/managers/detokenizer_manager.py +8 -4
  160. sglang/srt/managers/disagg_service.py +46 -0
  161. sglang/srt/managers/io_struct.py +199 -12
  162. sglang/srt/managers/mm_utils.py +1 -0
  163. sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
  164. sglang/srt/managers/schedule_batch.py +77 -56
  165. sglang/srt/managers/schedule_policy.py +4 -3
  166. sglang/srt/managers/scheduler.py +191 -139
  167. sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
  168. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  169. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  170. sglang/srt/managers/template_manager.py +3 -3
  171. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  172. sglang/srt/managers/tokenizer_manager.py +260 -519
  173. sglang/srt/managers/tp_worker.py +53 -4
  174. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  175. sglang/srt/mem_cache/allocator.py +1 -1
  176. sglang/srt/mem_cache/hicache_storage.py +18 -33
  177. sglang/srt/mem_cache/hiradix_cache.py +108 -48
  178. sglang/srt/mem_cache/memory_pool.py +347 -48
  179. sglang/srt/mem_cache/memory_pool_host.py +121 -57
  180. sglang/srt/mem_cache/radix_cache.py +0 -2
  181. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  182. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  183. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
  184. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  185. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  186. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
  187. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  188. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  189. sglang/srt/metrics/collector.py +502 -77
  190. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  191. sglang/srt/metrics/utils.py +48 -0
  192. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  193. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  194. sglang/srt/model_executor/forward_batch_info.py +75 -19
  195. sglang/srt/model_executor/model_runner.py +357 -30
  196. sglang/srt/model_loader/__init__.py +9 -3
  197. sglang/srt/model_loader/loader.py +128 -4
  198. sglang/srt/model_loader/weight_utils.py +2 -1
  199. sglang/srt/models/apertus.py +686 -0
  200. sglang/srt/models/bailing_moe.py +798 -218
  201. sglang/srt/models/bailing_moe_nextn.py +168 -0
  202. sglang/srt/models/deepseek_v2.py +346 -48
  203. sglang/srt/models/dots_vlm.py +174 -0
  204. sglang/srt/models/dots_vlm_vit.py +337 -0
  205. sglang/srt/models/ernie4.py +1 -1
  206. sglang/srt/models/gemma3n_mm.py +1 -1
  207. sglang/srt/models/glm4_moe.py +11 -2
  208. sglang/srt/models/glm4v.py +4 -2
  209. sglang/srt/models/glm4v_moe.py +3 -0
  210. sglang/srt/models/gpt_oss.py +1 -1
  211. sglang/srt/models/internvl.py +28 -0
  212. sglang/srt/models/llama4.py +9 -0
  213. sglang/srt/models/llama_eagle3.py +13 -0
  214. sglang/srt/models/longcat_flash.py +2 -2
  215. sglang/srt/models/minicpmv.py +165 -3
  216. sglang/srt/models/mllama4.py +25 -0
  217. sglang/srt/models/opt.py +637 -0
  218. sglang/srt/models/qwen2.py +7 -0
  219. sglang/srt/models/qwen2_5_vl.py +27 -3
  220. sglang/srt/models/qwen2_moe.py +60 -13
  221. sglang/srt/models/qwen3.py +8 -2
  222. sglang/srt/models/qwen3_moe.py +40 -9
  223. sglang/srt/models/qwen3_next.py +1042 -0
  224. sglang/srt/models/qwen3_next_mtp.py +112 -0
  225. sglang/srt/models/step3_vl.py +1 -1
  226. sglang/srt/models/torch_native_llama.py +1 -1
  227. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  228. sglang/srt/multimodal/processors/glm4v.py +9 -9
  229. sglang/srt/multimodal/processors/internvl.py +141 -129
  230. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  231. sglang/srt/offloader.py +27 -3
  232. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  233. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  234. sglang/srt/sampling/sampling_batch_info.py +18 -15
  235. sglang/srt/server_args.py +355 -37
  236. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  237. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  238. sglang/srt/speculative/eagle_utils.py +0 -2
  239. sglang/srt/speculative/eagle_worker.py +197 -112
  240. sglang/srt/speculative/spec_info.py +5 -0
  241. sglang/srt/speculative/standalone_worker.py +109 -0
  242. sglang/srt/tracing/trace.py +552 -0
  243. sglang/srt/utils.py +46 -3
  244. sglang/srt/weight_sync/utils.py +1 -1
  245. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  246. sglang/test/few_shot_gsm8k.py +1 -0
  247. sglang/test/runners.py +4 -0
  248. sglang/test/test_cutlass_moe.py +24 -6
  249. sglang/test/test_disaggregation_utils.py +66 -0
  250. sglang/test/test_fp4_moe.py +370 -1
  251. sglang/test/test_utils.py +28 -1
  252. sglang/utils.py +12 -0
  253. sglang/version.py +1 -1
  254. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  255. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
  256. sglang/srt/disaggregation/launch_lb.py +0 -118
  257. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  258. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  259. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  260. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  261. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  262. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  263. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  264. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  265. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
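The inline diff reproduced below is the per-line view for entry 172 above, sglang/srt/managers/tokenizer_manager.py (+260 -519):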
@@ -31,18 +31,7 @@ from contextlib import nullcontext
 from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
-from typing import (
-    Any,
-    Awaitable,
-    Deque,
-    Dict,
-    Generic,
-    List,
-    Optional,
-    Tuple,
-    TypeVar,
-    Union,
-)
+from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union

 import fastapi
 import torch
@@ -53,18 +42,15 @@ from fastapi import BackgroundTasks

 from sglang.srt.aio_rwlock import RWLock
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.disaggregation.utils import (
-    DisaggregationMode,
-    KVClassType,
-    TransferBackend,
-    get_kv_class,
-)
+from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.hf_transformers_utils import (
     get_processor,
     get_tokenizer,
     get_tokenizer_from_processor,
 )
 from sglang.srt.lora.lora_registry import LoRARef, LoRARegistry
+from sglang.srt.managers.async_dynamic_batch_tokenizer import AsyncDynamicbatchTokenizer
+from sglang.srt.managers.disagg_service import start_disagg_service
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchEmbeddingOut,
@@ -73,60 +59,38 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOut,
     BatchTokenizedEmbeddingReqInput,
     BatchTokenizedGenerateReqInput,
-    ClearHiCacheReqInput,
-    ClearHiCacheReqOutput,
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
-    ExpertDistributionReq,
-    ExpertDistributionReqOutput,
-    FlushCacheReqInput,
-    FlushCacheReqOutput,
     FreezeGCReq,
     GenerateReqInput,
-    GetInternalStateReq,
-    GetInternalStateReqOutput,
-    GetWeightsByNameReqInput,
-    GetWeightsByNameReqOutput,
+    GetLoadReqInput,
     HealthCheckOutput,
-    InitWeightsUpdateGroupReqInput,
-    InitWeightsUpdateGroupReqOutput,
-    LoadLoRAAdapterReqInput,
-    LoadLoRAAdapterReqOutput,
-    LoRAUpdateResult,
-    MultiTokenizerWarpper,
+    MultiTokenizerWrapper,
     OpenSessionReqInput,
     OpenSessionReqOutput,
-    ProfileReq,
-    ProfileReqOutput,
-    ProfileReqType,
-    ReleaseMemoryOccupationReqInput,
-    ReleaseMemoryOccupationReqOutput,
-    ResumeMemoryOccupationReqInput,
-    ResumeMemoryOccupationReqOutput,
     SessionParams,
-    SetInternalStateReq,
-    SetInternalStateReqOutput,
-    SlowDownReqInput,
-    SlowDownReqOutput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
-    UnloadLoRAAdapterReqInput,
-    UnloadLoRAAdapterReqOutput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightFromDiskReqOutput,
-    UpdateWeightsFromDistributedReqInput,
-    UpdateWeightsFromDistributedReqOutput,
-    UpdateWeightsFromTensorReqInput,
-    UpdateWeightsFromTensorReqOutput,
+    WatchLoadUpdateReq,
 )
 from sglang.srt.managers.mm_utils import TensorTransportMode
 from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
 from sglang.srt.managers.scheduler import is_health_check_generate_req
 from sglang.srt.managers.scheduler_input_blocker import input_blocker_guard_region
+from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicatorMixin
 from sglang.srt.metrics.collector import TokenizerMetricsCollector
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.tracing.trace import (
+    trace_get_proc_propagate_context,
+    trace_req_finish,
+    trace_req_start,
+    trace_slice_end,
+    trace_slice_start,
+)
 from sglang.srt.utils import (
     configure_gc_warning,
     dataclass_to_string_truncated,
@@ -180,7 +144,7 @@ class ReqState:
     output_token_ids_logprobs_idx: List = dataclasses.field(default_factory=list)


-class TokenizerManager:
+class TokenizerManager(TokenizerCommunicatorMixin):
     """TokenizerManager is a process that tokenizes the text."""

     def __init__(
@@ -262,6 +226,18 @@ class TokenizerManager:
                     trust_remote_code=server_args.trust_remote_code,
                     revision=server_args.revision,
                 )
+        # Initialize async dynamic batch tokenizer if enabled (common for both multimodal and non-multimodal)
+        if (
+            server_args.enable_dynamic_batch_tokenizer
+            and not server_args.skip_tokenizer_init
+        ):
+            self.async_dynamic_batch_tokenizer = AsyncDynamicbatchTokenizer(
+                self.tokenizer,
+                max_batch_size=server_args.dynamic_batch_tokenizer_batch_size,
+                batch_wait_timeout_s=server_args.dynamic_batch_tokenizer_batch_timeout,
+            )
+        else:
+            self.async_dynamic_batch_tokenizer = None

         # Init inter-process communication
         context = zmq.asyncio.Context(2)
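Note: the hunk above wires in the new AsyncDynamicbatchTokenizer (entry 156 in the file list), which coalesces concurrent single-text encode() calls into one batched tokenizer call, bounded by the dynamic_batch_tokenizer_batch_size and dynamic_batch_tokenizer_batch_timeout server args seen here. A minimal self-contained sketch of that dynamic-batching idea, with invented names (DynamicBatchTokenizerSketch, tokenize_batch) rather than the actual sglang implementation:

import asyncio
from typing import Any, Callable, List, Tuple


class DynamicBatchTokenizerSketch:
    """Coalesce concurrent encode() calls into batched tokenizer calls."""

    def __init__(
        self,
        tokenize_batch: Callable[[List[str]], List[Any]],
        max_batch_size: int = 32,
        batch_wait_timeout_s: float = 0.002,
    ) -> None:
        self.tokenize_batch = tokenize_batch
        self.max_batch_size = max_batch_size
        self.batch_wait_timeout_s = batch_wait_timeout_s
        self.queue: "asyncio.Queue[Tuple[str, asyncio.Future]]" = asyncio.Queue()
        self._worker = None

    async def encode(self, text: str) -> Any:
        if self._worker is None:  # start the batching worker lazily
            self._worker = asyncio.create_task(self._run())
        fut = asyncio.get_running_loop().create_future()
        await self.queue.put((text, fut))
        return await fut

    async def _run(self) -> None:
        while True:
            batch = [await self.queue.get()]  # block for the first request
            try:
                # then wait briefly so more concurrent requests can join the batch
                while len(batch) < self.max_batch_size:
                    batch.append(
                        await asyncio.wait_for(
                            self.queue.get(), timeout=self.batch_wait_timeout_s
                        )
                    )
            except asyncio.TimeoutError:
                pass
            results = self.tokenize_batch([text for text, _ in batch])
            for (_, fut), result in zip(batch, results):
                fut.set_result(result)


async def main() -> None:
    tok = DynamicBatchTokenizerSketch(lambda texts: [t.split() for t in texts])
    print(await asyncio.gather(tok.encode("a b"), tok.encode("c")))

asyncio.run(main())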
@@ -319,8 +295,10 @@ class TokenizerManager:
         # LoRA updates and inference to overlap.
         self.lora_update_lock = asyncio.Lock()

-        # For PD disaggregtion
-        self.init_disaggregation()
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
+        self.bootstrap_server = start_disagg_service(self.server_args)

         # For load balancing
         self.current_load = 0
@@ -328,11 +306,16 @@ class TokenizerManager:

         # Metrics
         if self.enable_metrics:
+            labels = {
+                "model_name": self.server_args.served_model_name,
+                # TODO: Add lora name/path in the future,
+            }
+            if server_args.tokenizer_metrics_allowed_customer_labels:
+                for label in server_args.tokenizer_metrics_allowed_customer_labels:
+                    labels[label] = ""
             self.metrics_collector = TokenizerMetricsCollector(
-                labels={
-                    "model_name": self.server_args.served_model_name,
-                    # TODO: Add lora name/path in the future,
-                },
+                server_args=server_args,
+                labels=labels,
                 bucket_time_to_first_token=self.server_args.bucket_time_to_first_token,
                 bucket_e2e_request_latency=self.server_args.bucket_e2e_request_latency,
                 bucket_inter_token_latency=self.server_args.bucket_inter_token_latency,
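Note: with tokenizer_metrics_allowed_customer_labels set, the allowed label keys are registered up front with empty-string defaults so the Prometheus metric schema stays fixed; per-request values are merged in later (see the observe_* call sites further down in this diff). A small illustration of that merge, with invented values:

# Base labels plus pre-registered customer label keys (values default to "").
labels = {"model_name": "my-model", "customer_id": ""}

# Per request, customer-supplied labels override the defaults:
customer_labels = {"customer_id": "acme"}
merged = {**labels, **customer_labels} if customer_labels else labels
print(merged)  # {'model_name': 'my-model', 'customer_id': 'acme'}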
@@ -343,50 +326,6 @@ class TokenizerManager:
         if self.server_args.gc_warning_threshold_secs > 0.0:
             configure_gc_warning(self.server_args.gc_warning_threshold_secs)

-        # Communicators
-        self.init_weights_update_group_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.update_weights_from_distributed_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.update_weights_from_tensor_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.get_weights_by_name_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.release_memory_occupation_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.resume_memory_occupation_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.slow_down_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.flush_cache_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.clear_hicache_storage_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.profile_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.get_internal_state_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.set_internal_state_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.expert_distribution_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-        self.update_lora_adapter_communicator = _Communicator(
-            self.send_to_scheduler, server_args.dp_size
-        )
-
         self._result_dispatcher = TypeBasedDispatcher(
             [
                 (
@@ -404,100 +343,15 @@ class TokenizerManager:
                     UpdateWeightFromDiskReqOutput,
                     self._handle_update_weights_from_disk_req_output,
                 ),
-                (
-                    InitWeightsUpdateGroupReqOutput,
-                    self.init_weights_update_group_communicator.handle_recv,
-                ),
-                (
-                    UpdateWeightsFromDistributedReqOutput,
-                    self.update_weights_from_distributed_communicator.handle_recv,
-                ),
-                (
-                    UpdateWeightsFromTensorReqOutput,
-                    self.update_weights_from_tensor_communicator.handle_recv,
-                ),
-                (
-                    GetWeightsByNameReqOutput,
-                    self.get_weights_by_name_communicator.handle_recv,
-                ),
-                (
-                    ReleaseMemoryOccupationReqOutput,
-                    self.release_memory_occupation_communicator.handle_recv,
-                ),
-                (
-                    ResumeMemoryOccupationReqOutput,
-                    self.resume_memory_occupation_communicator.handle_recv,
-                ),
-                (
-                    SlowDownReqOutput,
-                    self.slow_down_communicator.handle_recv,
-                ),
-                (
-                    ClearHiCacheReqOutput,
-                    self.clear_hicache_storage_communicator.handle_recv,
-                ),
-                (
-                    FlushCacheReqOutput,
-                    self.flush_cache_communicator.handle_recv,
-                ),
-                (
-                    ProfileReqOutput,
-                    self.profile_communicator.handle_recv,
-                ),
                 (
                     FreezeGCReq,
                     lambda x: None,
                 ),  # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it.
-                (
-                    GetInternalStateReqOutput,
-                    self.get_internal_state_communicator.handle_recv,
-                ),
-                (
-                    SetInternalStateReqOutput,
-                    self.set_internal_state_communicator.handle_recv,
-                ),
-                (
-                    ExpertDistributionReqOutput,
-                    self.expert_distribution_communicator.handle_recv,
-                ),
-                (
-                    LoRAUpdateResult,
-                    self.update_lora_adapter_communicator.handle_recv,
-                ),
                 (HealthCheckOutput, lambda x: None),
             ]
         )

-    def init_disaggregation(self):
-        self.disaggregation_mode = DisaggregationMode(
-            self.server_args.disaggregation_mode
-        )
-        self.disaggregation_transfer_backend = TransferBackend(
-            self.server_args.disaggregation_transfer_backend
-        )
-        # Start kv boostrap server on prefill
-        if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            # only start bootstrap server on prefill tm
-            kv_bootstrap_server_class = get_kv_class(
-                self.disaggregation_transfer_backend, KVClassType.BOOTSTRAP_SERVER
-            )
-            self.bootstrap_server = kv_bootstrap_server_class(
-                self.server_args.disaggregation_bootstrap_port
-            )
-            is_create_store = (
-                self.server_args.node_rank == 0
-                and self.server_args.disaggregation_transfer_backend == "ascend"
-            )
-            if is_create_store:
-                try:
-                    from mf_adapter import create_config_store
-
-                    ascend_url = os.getenv("ASCEND_MF_STORE_URL")
-                    create_config_store(ascend_url)
-                except Exception as e:
-                    error_message = f"Failed create mf store, invalid ascend_url."
-                    error_message += f" With exception {e}"
-                    raise error_message
+        self.init_communicators(server_args)

     async def generate_request(
         self,
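Note: the fourteen hand-rolled _Communicator fields and their dispatcher entries collapse into a single init_communicators() call provided by the new TokenizerCommunicatorMixin (entry 171 in the file list), which also hosts the public helpers (flush_cache, start_profile, get_internal_state, ...) deleted from this file further down. A hedged toy of the refactor shape, not the real mixin API:

import asyncio


class TokenizerCommunicatorMixinSketch:
    def init_communicators(self, dp_size: int) -> None:
        # the real mixin builds one fan-out communicator per request type here
        self.dp_size = dp_size

    async def flush_cache(self) -> str:
        # stands in for: (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
        return f"flush fanned out to {self.dp_size} scheduler(s)"


class TokenizerManagerSketch(TokenizerCommunicatorMixinSketch):
    def __init__(self, dp_size: int = 2) -> None:
        self.init_communicators(dp_size)


print(asyncio.run(TokenizerManagerSketch().flush_cache()))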
@@ -517,6 +371,24 @@ class TokenizerManager:
                 # If it's a single value, add worker_id prefix
                 obj.rid = f"{self.worker_id}_{obj.rid}"

+        if obj.is_single:
+            bootstrap_room = (
+                obj.bootstrap_room if hasattr(obj, "bootstrap_room") else None
+            )
+            trace_req_start(obj.rid, bootstrap_room, ts=int(created_time * 1e9))
+            trace_slice_start("", obj.rid, ts=int(created_time * 1e9), anonymous=True)
+        else:
+            for i in range(len(obj.rid)):
+                bootstrap_room = (
+                    obj.bootstrap_room[i]
+                    if hasattr(obj, "bootstrap_room") and obj.bootstrap_room
+                    else None
+                )
+                trace_req_start(obj.rid[i], bootstrap_room, ts=int(created_time * 1e9))
+                trace_slice_start(
+                    "", obj.rid[i], ts=int(created_time * 1e9), anonymous=True
+                )
+
         if self.log_requests:
             max_length, skip_names, _ = self.log_request_metadata
             logger.info(
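Note: the trace_* calls above take integer nanosecond timestamps, hence the int(created_time * 1e9) conversion from the float seconds returned by time.time(). For example:

import time

created_time = time.time()       # float seconds, e.g. 1726000000.123456
ts_ns = int(created_time * 1e9)  # int nanoseconds, e.g. 1726000000123456000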
@@ -542,6 +414,144 @@ class TokenizerManager:
                 ):
                     yield response

+    def _detect_input_format(
+        self, texts: Union[str, List[str]], is_cross_encoder: bool
+    ) -> str:
+        """Detect the format of input texts for proper tokenization handling.
+
+        Returns:
+            - "single_string": Regular single text like "Hello world"
+            - "batch_strings": Regular batch like ["Hello", "World"]
+            - "cross_encoder_pairs": Cross-encoder pairs like [["query", "document"]]
+        """
+        if isinstance(texts, str):
+            return "single_string"
+
+        if (
+            is_cross_encoder
+            and len(texts) > 0
+            and isinstance(texts[0], list)
+            and len(texts[0]) == 2
+        ):
+            return "cross_encoder_pairs"
+
+        return "batch_strings"
+
+    def _prepare_tokenizer_input(
+        self, texts: Union[str, List[str]], input_format: str
+    ) -> Union[List[str], List[List[str]]]:
+        """Prepare input for the tokenizer based on detected format."""
+        if input_format == "single_string":
+            return [texts]  # Wrap single string for batch processing
+        elif input_format == "cross_encoder_pairs":
+            return texts  # Already in correct format: [["query", "doc"]]
+        else:  # batch_strings
+            return texts  # Already in correct format: ["text1", "text2"]
+
+    def _extract_tokenizer_results(
+        self,
+        input_ids: List[List[int]],
+        token_type_ids: Optional[List[List[int]]],
+        input_format: str,
+        original_batch_size: int,
+    ) -> Union[
+        Tuple[List[int], Optional[List[int]]],
+        Tuple[List[List[int]], Optional[List[List[int]]]],
+    ]:
+        """Extract results from tokenizer output based on input format."""
+
+        # For single inputs (string or single cross-encoder pair), extract first element
+        if (
+            input_format in ["single_string", "cross_encoder_pairs"]
+            and original_batch_size == 1
+        ):
+            single_input_ids = input_ids[0] if input_ids else []
+            single_token_type_ids = token_type_ids[0] if token_type_ids else None
+            return single_input_ids, single_token_type_ids
+
+        # For true batches, return as-is
+        return input_ids, token_type_ids
+
+    async def _tokenize_texts(
+        self, texts: Union[str, List[str]], is_cross_encoder: bool = False
+    ) -> Union[
+        Tuple[List[int], Optional[List[int]]],
+        Tuple[List[List[int]], Optional[List[List[int]]]],
+    ]:
+        """
+        Tokenize text(s) using the appropriate tokenizer strategy.
+
+        This method handles multiple input formats and chooses between async dynamic
+        batch tokenizer (for single texts only) and regular tokenizer.
+
+        Args:
+            texts: Text input in various formats:
+
+                Regular cases:
+                - Single string: "How are you?"
+                - Batch of strings: ["Hello", "World", "How are you?"]
+
+                Cross-encoder cases (sentence pairs for similarity/ranking):
+                - Single pair: [["query text", "document text"]]
+                - Multiple pairs: [["q1", "d1"], ["q2", "d2"], ["q3", "d3"]]
+
+            is_cross_encoder: Whether to return token_type_ids for cross-encoder models.
+                Enables proper handling of sentence pairs with segment IDs.
+
+        Returns:
+            Single input cases:
+                Tuple[List[int], Optional[List[int]]]: (input_ids, token_type_ids)
+                Example: ([101, 2129, 102], [0, 0, 0]) for single text
+                Example: ([101, 2129, 102, 4068, 102], [0, 0, 0, 1, 1]) for cross-encoder pair
+
+            Batch input cases:
+                Tuple[List[List[int]], Optional[List[List[int]]]]: (batch_input_ids, batch_token_type_ids)
+                Example: ([[101, 2129, 102], [101, 4068, 102]], None) for regular batch
+
+        Note: token_type_ids is None unless is_cross_encoder=True.
+        """
+        if not texts or self.tokenizer is None:
+            raise ValueError("texts cannot be empty and tokenizer must be initialized")
+
+        # Step 1: Detect input format and prepare for tokenization
+        input_format = self._detect_input_format(texts, is_cross_encoder)
+        tokenizer_input = self._prepare_tokenizer_input(texts, input_format)
+        original_batch_size = len(texts) if not isinstance(texts, str) else 1
+
+        # Step 2: Set up tokenizer arguments
+        tokenizer_kwargs = (
+            {"return_token_type_ids": is_cross_encoder} if is_cross_encoder else {}
+        )
+
+        # Step 3: Choose tokenization strategy
+        use_async_tokenizer = (
+            self.async_dynamic_batch_tokenizer is not None
+            and input_format == "single_string"
+        )
+
+        if use_async_tokenizer:
+            logger.debug("Using async dynamic batch tokenizer for single text")
+            result = await self.async_dynamic_batch_tokenizer.encode(
+                tokenizer_input[0], **tokenizer_kwargs
+            )
+            # Convert to batch format for consistency
+            input_ids = [result["input_ids"]]
+            token_type_ids = (
+                [result["token_type_ids"]]
+                if is_cross_encoder and result.get("token_type_ids")
+                else None
+            )
+        else:
+            logger.debug(f"Using regular tokenizer for {len(tokenizer_input)} inputs")
+            encoded = self.tokenizer(tokenizer_input, **tokenizer_kwargs)
+            input_ids = encoded["input_ids"]
+            token_type_ids = encoded.get("token_type_ids") if is_cross_encoder else None
+
+        # Step 4: Extract results based on input format
+        return self._extract_tokenizer_results(
+            input_ids, token_type_ids, input_format, original_batch_size
+        )
+
     async def _tokenize_one_request(
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
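Note: the new helpers route every text input through one code path: detect the format, normalize to a batch, tokenize (async dynamic batcher for single strings, plain tokenizer otherwise), then un-batch single inputs. A self-contained sketch of that dispatch using a toy whitespace tokenizer in place of self.tokenizer (IDs and names invented):

from typing import List, Optional, Tuple, Union


def toy_encode(text: str) -> List[int]:
    return [hash(tok) % 1000 for tok in text.split()]


def tokenize_texts(
    texts: Union[str, List[str]], is_cross_encoder: bool = False
) -> Tuple[Union[List[int], List[List[int]]], Optional[list]]:
    if isinstance(texts, str):  # "single_string" -> flat list
        return toy_encode(texts), None
    if is_cross_encoder and texts and isinstance(texts[0], list):
        # "cross_encoder_pairs": concatenate each [query, document] pair and
        # emit segment IDs (0 for the first sentence, 1 for the second)
        ids = [toy_encode(q) + toy_encode(d) for q, d in texts]
        type_ids = [
            [0] * len(toy_encode(q)) + [1] * len(toy_encode(d)) for q, d in texts
        ]
        if len(texts) == 1:  # a single pair is returned as flat lists
            return ids[0], type_ids[0]
        return ids, type_ids
    return [toy_encode(t) for t in texts], None  # "batch_strings"


print(tokenize_texts("How are you?"))            # flat ids, None
print(tokenize_texts(["Hello", "World"]))        # one id list per text, None
print(tokenize_texts([["query", "doc"]], True))  # flat ids plus segment ids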
@@ -572,14 +582,10 @@ class TokenizerManager:
                     "accept text prompts. Please provide input_ids or re-initialize "
                     "the engine with skip_tokenizer_init=False."
                 )
-            encoded = self.tokenizer(
-                input_text, return_token_type_ids=is_cross_encoder_request
-            )

-            input_ids = encoded["input_ids"]
-            if is_cross_encoder_request:
-                input_ids = encoded["input_ids"][0]
-                token_type_ids = encoded.get("token_type_ids", [None])[0]
+            input_ids, token_type_ids = await self._tokenize_texts(
+                input_text, is_cross_encoder_request
+            )

         if self.mm_processor and obj.contains_mm_input():
             if not isinstance(obj.image_data, list):
@@ -599,6 +605,7 @@ class TokenizerManager:
             mm_inputs = None

         self._validate_one_request(obj, input_ids)
+        trace_slice_end("tokenize", obj.rid)
         return self._create_tokenized_object(
             obj, input_text, input_ids, input_embeds, mm_inputs, token_type_ids
         )
@@ -673,7 +680,7 @@ class TokenizerManager:
         ):
             raise ValueError(
                 "The server is not configured to enable custom logit processor. "
-                "Please set `--enable-custom-logits-processor` to enable this feature."
+                "Please set `--enable-custom-logit-processor` to enable this feature."
             )

     def _validate_input_ids_in_vocab(
@@ -754,19 +761,30 @@ class TokenizerManager:
         requests = [obj[i] for i in range(batch_size)]
         texts = [req.text for req in requests]

-        # Batch tokenize all texts
-        encoded = self.tokenizer(texts)
-        input_ids_list = encoded["input_ids"]
+        # Check if any request is a cross-encoder request
+        is_cross_encoder_request = any(
+            isinstance(req, EmbeddingReqInput) and req.is_cross_encoder_request
+            for req in requests
+        )
+
+        # Batch tokenize all texts using unified method
+        input_ids_list, token_type_ids_list = await self._tokenize_texts(
+            texts, is_cross_encoder_request
+        )

         # Process all requests
         tokenized_objs = []
         for i, req in enumerate(requests):
             self._validate_one_request(obj[i], input_ids_list[i])
+            token_type_ids = (
+                token_type_ids_list[i] if token_type_ids_list is not None else None
+            )
             tokenized_objs.append(
                 self._create_tokenized_object(
-                    req, req.text, input_ids_list[i], None, None
+                    req, req.text, input_ids_list[i], None, None, token_type_ids
                 )
             )
+            trace_slice_end("tokenize", req.rid)
         logger.debug(f"Completed batch processing for {batch_size} requests")
         return tokenized_objs
@@ -794,9 +812,12 @@ class TokenizerManager:
         tokenized_obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput],
         created_time: Optional[float] = None,
     ):
+        trace_slice_start("dispatch", obj.rid)
+        tokenized_obj.trace_context = trace_get_proc_propagate_context(obj.rid)
         self.send_to_scheduler.send_pyobj(tokenized_obj)
         state = ReqState([], False, asyncio.Event(), obj, created_time=created_time)
         self.rid_to_state[obj.rid] = state
+        trace_slice_end("dispatch", obj.rid, thread_finish_flag=True)
         return state

     def _send_batch_request(
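Note: _send_one_request is now bracketed by "dispatch" slice events, and the scheduler-side context rides along in tokenized_obj.trace_context, so a viewer can stitch per-request timelines across processes. A toy recorder showing the begin/end-slice bookkeeping (invented names, not the sglang tracing API):

import time
from collections import defaultdict

events = defaultdict(list)

def slice_start(name: str, rid: str) -> None:
    events[rid].append((name, "B", time.time_ns()))  # begin event

def slice_end(name: str, rid: str) -> None:
    events[rid].append((name, "E", time.time_ns()))  # end event

slice_start("dispatch", "req-1")
# ... send the tokenized request to the scheduler ...
slice_end("dispatch", "req-1")
print(events["req-1"])  # [('dispatch', 'B', ...), ('dispatch', 'E', ...)]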
@@ -1014,74 +1035,14 @@ class TokenizerManager:
         except StopAsyncIteration:
             pass

-    async def flush_cache(self) -> FlushCacheReqOutput:
-        return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
-
-    async def clear_hicache_storage(self) -> ClearHiCacheReqOutput:
-        """Clear the hierarchical cache storage."""
-        # Delegate to the scheduler to handle HiCacheStorage clearing
-        return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[
-            0
-        ]
-
     def abort_request(self, rid: str = "", abort_all: bool = False):
         if not abort_all and rid not in self.rid_to_state:
             return
         req = AbortReq(rid, abort_all)
         self.send_to_scheduler.send_pyobj(req)
-
         if self.enable_metrics:
             self.metrics_collector.observe_one_aborted_request()

-    async def start_profile(
-        self,
-        output_dir: Optional[str] = None,
-        start_step: Optional[int] = None,
-        num_steps: Optional[int] = None,
-        activities: Optional[List[str]] = None,
-        with_stack: Optional[bool] = None,
-        record_shapes: Optional[bool] = None,
-        profile_by_stage: bool = False,
-    ):
-        self.auto_create_handle_loop()
-        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
-        with_stack = False if with_stack is False or env_with_stack is False else True
-        req = ProfileReq(
-            type=ProfileReqType.START_PROFILE,
-            output_dir=output_dir,
-            start_step=start_step,
-            num_steps=num_steps,
-            activities=activities,
-            with_stack=with_stack,
-            record_shapes=record_shapes,
-            profile_by_stage=profile_by_stage,
-            profile_id=str(time.time()),
-        )
-        return await self._execute_profile(req)
-
-    async def stop_profile(self):
-        self.auto_create_handle_loop()
-        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
-        return await self._execute_profile(req)
-
-    async def _execute_profile(self, req: ProfileReq):
-        result = (await self.profile_communicator(req))[0]
-        if not result.success:
-            raise RuntimeError(result.message)
-        return result
-
-    async def start_expert_distribution_record(self):
-        self.auto_create_handle_loop()
-        await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)
-
-    async def stop_expert_distribution_record(self):
-        self.auto_create_handle_loop()
-        await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)
-
-    async def dump_expert_distribution_record(self):
-        self.auto_create_handle_loop()
-        await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)
-
     async def pause_generation(self):
         async with self.is_pause_cond:
             self.is_pause = True
@@ -1117,7 +1078,7 @@ class TokenizerManager:
         self, obj: UpdateWeightFromDiskReqInput
     ) -> Tuple[bool, str]:
         if self.server_args.tokenizer_worker_num > 1:
-            obj = MultiTokenizerWarpper(self.worker_id, obj)
+            obj = MultiTokenizerWrapper(self.worker_id, obj)
         self.send_to_scheduler.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         if self.server_args.dp_size == 1:
@@ -1142,191 +1103,6 @@ class TokenizerManager:
         all_paused_requests = [r.num_paused_requests for r in result]
         return all_success, all_message, all_paused_requests

-    async def init_weights_update_group(
-        self,
-        obj: InitWeightsUpdateGroupReqInput,
-        request: Optional[fastapi.Request] = None,
-    ) -> Tuple[bool, str]:
-        self.auto_create_handle_loop()
-        assert (
-            self.server_args.dp_size == 1
-        ), "dp_size must be 1 for init parameter update group"
-        result = (await self.init_weights_update_group_communicator(obj))[0]
-        return result.success, result.message
-
-    async def update_weights_from_distributed(
-        self,
-        obj: UpdateWeightsFromDistributedReqInput,
-        request: Optional[fastapi.Request] = None,
-    ) -> Tuple[bool, str]:
-        self.auto_create_handle_loop()
-        assert (
-            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
-        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
-
-        if obj.abort_all_requests:
-            self.abort_request(abort_all=True)
-
-        # This means that weight sync
-        # cannot run while requests are in progress.
-        async with self.model_update_lock.writer_lock:
-            result = (await self.update_weights_from_distributed_communicator(obj))[0]
-            return result.success, result.message
-
-    async def update_weights_from_tensor(
-        self,
-        obj: UpdateWeightsFromTensorReqInput,
-        request: Optional[fastapi.Request] = None,
-    ) -> Tuple[bool, str]:
-        self.auto_create_handle_loop()
-        assert (
-            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
-        ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
-
-        if obj.abort_all_requests:
-            self.abort_request(abort_all=True)
-
-        # This means that weight sync
-        # cannot run while requests are in progress.
-        async with self.model_update_lock.writer_lock:
-            result = (await self.update_weights_from_tensor_communicator(obj))[0]
-            return result.success, result.message
-
-    async def load_lora_adapter(
-        self,
-        obj: LoadLoRAAdapterReqInput,
-        _: Optional[fastapi.Request] = None,
-    ) -> LoadLoRAAdapterReqOutput:
-        self.auto_create_handle_loop()
-
-        try:
-            if not self.server_args.enable_lora:
-                raise ValueError(
-                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
-                )
-
-            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
-            # with dp_size > 1.
-            assert (
-                self.server_args.dp_size == 1
-            ), "dp_size must be 1 for dynamic lora loading"
-            logger.info(
-                "Start load Lora adapter. Lora name=%s, path=%s",
-                obj.lora_name,
-                obj.lora_path,
-            )
-
-            async with self.lora_update_lock:
-                if (
-                    self.server_args.max_loaded_loras is not None
-                    and self.lora_registry.num_registered_loras
-                    >= self.server_args.max_loaded_loras
-                ):
-                    raise ValueError(
-                        f"Cannot load LoRA adapter {obj.lora_name} at path {obj.lora_path}. "
-                        f"Maximum number of loaded LoRA adapters is {self.server_args.max_loaded_loras}. "
-                        "Please unload some LoRA adapters before loading new ones."
-                    )
-
-                # Generate new uniquely identifiable LoRARef object.
-                new_adapter = LoRARef(
-                    lora_name=obj.lora_name,
-                    lora_path=obj.lora_path,
-                    pinned=obj.pinned,
-                )
-
-                # Trigger the actual loading operation at the backend processes.
-                obj.lora_id = new_adapter.lora_id
-                result = (await self.update_lora_adapter_communicator(obj))[0]
-
-                # Register the LoRA adapter only after loading is successful.
-                if result.success:
-                    await self.lora_registry.register(new_adapter)
-
-                return result
-        except ValueError as e:
-            return LoadLoRAAdapterReqOutput(
-                success=False,
-                error_message=str(e),
-            )
-
-    async def unload_lora_adapter(
-        self,
-        obj: UnloadLoRAAdapterReqInput,
-        _: Optional[fastapi.Request] = None,
-    ) -> UnloadLoRAAdapterReqOutput:
-        self.auto_create_handle_loop()
-
-        try:
-            if not self.server_args.enable_lora:
-                raise ValueError(
-                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
-                )
-
-            assert (
-                obj.lora_name is not None
-            ), "lora_name must be provided to unload LoRA adapter"
-
-            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
-            # with dp_size > 1.
-            assert (
-                self.server_args.dp_size == 1
-            ), "dp_size must be 1 for dynamic lora loading"
-            logger.info(
-                "Start unload Lora adapter. Lora name=%s",
-                obj.lora_name,
-            )
-
-            async with self.lora_update_lock:
-                # Unregister the LoRA adapter from the registry to stop new requests for this adapter
-                # from being started.
-                lora_id = await self.lora_registry.unregister(obj.lora_name)
-                obj.lora_id = lora_id
-
-                # Initiate the actual unloading operation at the backend processes only after all
-                # ongoing requests using this LoRA adapter are finished.
-                await self.lora_registry.wait_for_unload(lora_id)
-                result = (await self.update_lora_adapter_communicator(obj))[0]
-
-                return result
-        except ValueError as e:
-            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))
-
-    async def get_weights_by_name(
-        self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
-    ):
-        self.auto_create_handle_loop()
-        results = await self.get_weights_by_name_communicator(obj)
-        all_parameters = [r.parameter for r in results]
-        if self.server_args.dp_size == 1:
-            return all_parameters[0]
-        else:
-            return all_parameters
-
-    async def release_memory_occupation(
-        self,
-        obj: ReleaseMemoryOccupationReqInput,
-        request: Optional[fastapi.Request] = None,
-    ):
-        self.auto_create_handle_loop()
-        await self.release_memory_occupation_communicator(obj)
-
-    async def resume_memory_occupation(
-        self,
-        obj: ResumeMemoryOccupationReqInput,
-        request: Optional[fastapi.Request] = None,
-    ):
-        self.auto_create_handle_loop()
-        await self.resume_memory_occupation_communicator(obj)
-
-    async def slow_down(
-        self,
-        obj: SlowDownReqInput,
-        request: Optional[fastapi.Request] = None,
-    ):
-        self.auto_create_handle_loop()
-        await self.slow_down_communicator(obj)
-
     async def open_session(
         self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
     ):
@@ -1338,7 +1114,7 @@ class TokenizerManager:
             return None

         if self.server_args.tokenizer_worker_num > 1:
-            obj = MultiTokenizerWarpper(self.worker_id, obj)
+            obj = MultiTokenizerWrapper(self.worker_id, obj)
         self.send_to_scheduler.send_pyobj(obj)

         self.session_futures[obj.session_id] = asyncio.Future()
@@ -1351,28 +1127,6 @@ class TokenizerManager:
     ):
         await self.send_to_scheduler.send_pyobj(obj)

-    async def get_internal_state(self) -> List[Dict[Any, Any]]:
-        req = GetInternalStateReq()
-        responses: List[GetInternalStateReqOutput] = (
-            await self.get_internal_state_communicator(req)
-        )
-        # Many DP ranks
-        return [res.internal_state for res in responses]
-
-    async def set_internal_state(self, obj: SetInternalStateReq) -> List[bool]:
-        responses: List[SetInternalStateReqOutput] = (
-            await self.set_internal_state_communicator(obj)
-        )
-        return [res.updated for res in responses]
-
-    async def get_load(self) -> dict:
-        # TODO(lsyin): fake load report server
-        if not self.current_load_lock.locked():
-            async with self.current_load_lock:
-                internal_state = await self.get_internal_state()
-                self.current_load = internal_state[0]["load"]
-        return {"load": self.current_load}
-
     def get_log_request_metadata(self):
         max_length = None
         skip_names = None
@@ -1491,6 +1245,9 @@ class TokenizerManager:
         self.asyncio_tasks.add(
             loop.create_task(print_exception_wrapper(self.sigterm_watchdog))
         )
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.watch_load_thread))
+        )

     def dump_requests_before_crash(self):
         if self.crash_dump_performed:
@@ -1710,6 +1467,9 @@ class TokenizerManager:
                 meta_info["spec_verify_ct"] = recv_obj.spec_verify_ct[i]
             state.finished_time = time.time()
             meta_info["e2e_latency"] = state.finished_time - state.created_time
+
+            trace_req_finish(rid, ts=int(state.finished_time * 1e9))
+
             del self.rid_to_state[rid]

             # Mark ongoing LoRA request as finished.
@@ -1859,6 +1619,12 @@ class TokenizerManager:
                 else 0
             )

+            customer_labels = getattr(state.obj, "customer_labels", None)
+            labels = (
+                {**self.metrics_collector.labels, **customer_labels}
+                if customer_labels
+                else self.metrics_collector.labels
+            )
             if (
                 state.first_token_time == 0.0
                 and self.disaggregation_mode != DisaggregationMode.PREFILL
@@ -1866,7 +1632,7 @@ class TokenizerManager:
                 state.first_token_time = state.last_time = time.time()
                 state.last_completion_tokens = completion_tokens
                 self.metrics_collector.observe_time_to_first_token(
-                    state.first_token_time - state.created_time
+                    labels, state.first_token_time - state.created_time
                 )
             else:
                 num_new_tokens = completion_tokens - state.last_completion_tokens
@@ -1874,6 +1640,7 @@ class TokenizerManager:
                     new_time = time.time()
                     interval = new_time - state.last_time
                     self.metrics_collector.observe_inter_token_latency(
+                        labels,
                         interval,
                         num_new_tokens,
                     )
@@ -1888,6 +1655,7 @@ class TokenizerManager:
                 or state.obj.sampling_params.get("structural_tag", None)
             )
             self.metrics_collector.observe_one_finished_request(
+                labels,
                 recv_obj.prompt_tokens[i],
                 completion_tokens,
                 recv_obj.cached_tokens[i],
@@ -2059,11 +1827,15 @@ class TokenizerManager:
         # the next position after the last token in the prompt
         output_logprobs = result["meta_info"].get("output_token_ids_logprobs", [])

-        # Throw an error here if output_logprobs is None
-        if output_logprobs is None:
+        # Check if output_logprobs is properly populated
+        if (
+            output_logprobs is None
+            or not output_logprobs
+            or len(output_logprobs) == 0
+        ):
             raise RuntimeError(
-                f"output_logprobs is None for request {result['meta_info'].get('id', '<unknown>')}. "
-                "This usually indicates a problem with the scoring request or the backend output."
+                f"output_logprobs is empty for request {result['meta_info'].get('id', '<unknown>')}. "
+                "This indicates token_ids_logprobs were not computed properly for the scoring request."
             )

         for logprob, token_id, _ in output_logprobs[0]:
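Note: the broadened guard rejects an empty list as well as None before output_logprobs[0] is indexed. (As a pure-Python aside, the clauses overlap: `not output_logprobs` is already true for None, [], and any zero-length sequence.) A tiny truth table:

for output_logprobs in (None, [], [[(-0.25, 42, None)]]):
    rejected = (
        output_logprobs is None or not output_logprobs or len(output_logprobs) == 0
    )
    print(repr(output_logprobs), "->", "raise" if rejected else "process")
# None -> raise, [] -> raise, [[(-0.25, 42, None)]] -> process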
@@ -2088,6 +1860,20 @@ class TokenizerManager:

         return scores

+    async def watch_load_thread(self):
+        # Only for dp_controller when dp_size > 1
+        if (
+            self.server_args.dp_size == 1
+            or self.server_args.load_balance_method == "round_robin"
+        ):
+            return
+
+        while True:
+            await asyncio.sleep(self.server_args.load_watch_interval)
+            loads = await self.get_load_communicator(GetLoadReqInput())
+            load_udpate_req = WatchLoadUpdateReq(loads=loads)
+            self.send_to_scheduler.send_pyobj(load_udpate_req)
+


 class ServerStatus(Enum):
     Up = "Up"
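Note: watch_load_thread is a poll-and-push loop: every load_watch_interval seconds it gathers per-rank loads through the communicator and forwards a WatchLoadUpdateReq toward the data-parallel controller; it is skipped for round_robin balancing, which does not use load information. The bare pattern, as a runnable sketch with invented callables:

import asyncio

async def watch_load(get_load, push_update, interval_s: float) -> None:
    while True:
        await asyncio.sleep(interval_s)  # sleep first: no push before one interval
        push_update(await get_load())

async def main() -> None:
    async def get_load():
        return {"load": 0}

    task = asyncio.create_task(watch_load(get_load, print, interval_s=0.1))
    await asyncio.sleep(0.35)  # let it fire a few times
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass

asyncio.run(main())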
@@ -2139,51 +1925,6 @@ class SignalHandler:
         kill_process_tree(os.getpid())


-T = TypeVar("T")
-
-
-class _Communicator(Generic[T]):
-    """Note: The communicator now only run up to 1 in-flight request at any time."""
-
-    enable_multi_tokenizer = False
-
-    def __init__(self, sender, fan_out: int):
-        self._sender = sender
-        self._fan_out = fan_out
-        self._result_event: Optional[asyncio.Event] = None
-        self._result_values: Optional[List[T]] = None
-        self._ready_queue: Deque[asyncio.Future] = deque()
-
-    async def __call__(self, obj):
-        ready_event = asyncio.Event()
-        if self._result_event is not None or len(self._ready_queue) > 0:
-            self._ready_queue.append(ready_event)
-            await ready_event.wait()
-            assert self._result_event is None
-            assert self._result_values is None
-
-        if obj:
-            if _Communicator.enable_multi_tokenizer:
-                obj = MultiTokenizerWarpper(worker_id=os.getpid(), obj=obj)
-            self._sender.send_pyobj(obj)
-
-        self._result_event = asyncio.Event()
-        self._result_values = []
-        await self._result_event.wait()
-        result_values = self._result_values
-        self._result_event = self._result_values = None
-
-        if len(self._ready_queue) > 0:
-            self._ready_queue.popleft().set()
-
-        return result_values
-
-    def handle_recv(self, recv_obj: T):
-        self._result_values.append(recv_obj)
-        if len(self._result_values) == self._fan_out:
-            self._result_event.set()
-
-
 # Note: request abort handling logic
 # We should handle all of the following cases correctly.
 #
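Note: the deleted _Communicator (its replacement ships with the communicator mixin) implements a fan-out/gather rendezvous: send one request toward the schedulers, then sleep until fan_out (i.e. dp_size) replies have been collected by handle_recv. A self-contained toy version of that contract, without the original's in-flight request queueing:

import asyncio
from typing import Any, List, Optional


class FanOutGather:
    def __init__(self, fan_out: int) -> None:
        self._fan_out = fan_out
        self._event: Optional[asyncio.Event] = None
        self._values: List[Any] = []

    async def __call__(self) -> List[Any]:
        self._event, self._values = asyncio.Event(), []
        await self._event.wait()  # wake only once every reply has arrived
        return self._values

    def handle_recv(self, value: Any) -> None:
        self._values.append(value)
        if len(self._values) == self._fan_out:
            self._event.set()


async def main() -> None:
    comm = FanOutGather(fan_out=2)
    waiter = asyncio.create_task(comm())
    await asyncio.sleep(0)  # let the waiter install its event
    comm.handle_recv("reply from dp rank 0")
    comm.handle_recv("reply from dp rank 1")
    print(await waiter)  # both replies, in arrival order

asyncio.run(main())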