sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ import copy
|
|
5
5
|
import logging
|
6
6
|
import os
|
7
7
|
import time
|
8
|
+
import uuid
|
8
9
|
from collections import deque
|
9
10
|
from typing import (
|
10
11
|
TYPE_CHECKING,
|
@@ -24,8 +25,12 @@ import zmq
|
|
24
25
|
from sglang.srt.managers.io_struct import (
|
25
26
|
ClearHiCacheReqInput,
|
26
27
|
ClearHiCacheReqOutput,
|
28
|
+
CloseSessionReqInput,
|
29
|
+
DestroyWeightsUpdateGroupReqInput,
|
30
|
+
DestroyWeightsUpdateGroupReqOutput,
|
27
31
|
ExpertDistributionReq,
|
28
32
|
ExpertDistributionReqOutput,
|
33
|
+
ExpertDistributionReqType,
|
29
34
|
FlushCacheReqInput,
|
30
35
|
FlushCacheReqOutput,
|
31
36
|
GetInternalStateReq,
|
@@ -40,8 +45,9 @@ from sglang.srt.managers.io_struct import (
|
|
40
45
|
InitWeightsUpdateGroupReqOutput,
|
41
46
|
LoadLoRAAdapterReqInput,
|
42
47
|
LoadLoRAAdapterReqOutput,
|
43
|
-
|
48
|
+
LoRAUpdateOutput,
|
44
49
|
MultiTokenizerWrapper,
|
50
|
+
OpenSessionReqInput,
|
45
51
|
ProfileReq,
|
46
52
|
ProfileReqOutput,
|
47
53
|
ProfileReqType,
|
@@ -149,6 +155,9 @@ class TokenizerCommunicatorMixin:
|
|
149
155
|
self.init_weights_update_group_communicator = _Communicator(
|
150
156
|
self.send_to_scheduler, server_args.dp_size
|
151
157
|
)
|
158
|
+
self.destroy_weights_update_group_communicator = _Communicator(
|
159
|
+
self.send_to_scheduler, server_args.dp_size
|
160
|
+
)
|
152
161
|
self.update_weights_from_distributed_communicator = _Communicator(
|
153
162
|
self.send_to_scheduler, server_args.dp_size
|
154
163
|
)
|
@@ -207,6 +216,10 @@ class TokenizerCommunicatorMixin:
|
|
207
216
|
InitWeightsUpdateGroupReqOutput,
|
208
217
|
self.init_weights_update_group_communicator.handle_recv,
|
209
218
|
),
|
219
|
+
(
|
220
|
+
DestroyWeightsUpdateGroupReqOutput,
|
221
|
+
self.destroy_weights_update_group_communicator.handle_recv,
|
222
|
+
),
|
210
223
|
(
|
211
224
|
UpdateWeightsFromDistributedReqOutput,
|
212
225
|
self.update_weights_from_distributed_communicator.handle_recv,
|
@@ -264,7 +277,7 @@ class TokenizerCommunicatorMixin:
|
|
264
277
|
self.expert_distribution_communicator.handle_recv,
|
265
278
|
),
|
266
279
|
(
|
267
|
-
|
280
|
+
LoRAUpdateOutput,
|
268
281
|
self.update_lora_adapter_communicator.handle_recv,
|
269
282
|
),
|
270
283
|
(
|
@@ -323,15 +336,18 @@ class TokenizerCommunicatorMixin:
|
|
323
336
|
|
324
337
|
async def start_expert_distribution_record(self: TokenizerManager):
|
325
338
|
self.auto_create_handle_loop()
|
326
|
-
|
339
|
+
req = ExpertDistributionReq(action=ExpertDistributionReqType.START_RECORD)
|
340
|
+
await self.expert_distribution_communicator(req)
|
327
341
|
|
328
342
|
async def stop_expert_distribution_record(self: TokenizerManager):
|
329
343
|
self.auto_create_handle_loop()
|
330
|
-
|
344
|
+
req = ExpertDistributionReq(action=ExpertDistributionReqType.STOP_RECORD)
|
345
|
+
await self.expert_distribution_communicator(req)
|
331
346
|
|
332
347
|
async def dump_expert_distribution_record(self: TokenizerManager):
|
333
348
|
self.auto_create_handle_loop()
|
334
|
-
|
349
|
+
req = ExpertDistributionReq(action=ExpertDistributionReqType.DUMP_RECORD)
|
350
|
+
await self.expert_distribution_communicator(req)
|
335
351
|
|
336
352
|
async def init_weights_update_group(
|
337
353
|
self: TokenizerManager,
|
@@ -345,6 +361,18 @@ class TokenizerCommunicatorMixin:
|
|
345
361
|
result = (await self.init_weights_update_group_communicator(obj))[0]
|
346
362
|
return result.success, result.message
|
347
363
|
|
364
|
+
async def destroy_weights_update_group(
|
365
|
+
self,
|
366
|
+
obj: DestroyWeightsUpdateGroupReqInput,
|
367
|
+
request: Optional[fastapi.Request] = None,
|
368
|
+
) -> Tuple[bool, str]:
|
369
|
+
self.auto_create_handle_loop()
|
370
|
+
assert (
|
371
|
+
self.server_args.dp_size == 1
|
372
|
+
), "dp_size must be 1 for destroy parameter update group"
|
373
|
+
result = (await self.destroy_weights_update_group_communicator(obj))[0]
|
374
|
+
return result.success, result.message
|
375
|
+
|
348
376
|
async def update_weights_from_distributed(
|
349
377
|
self: TokenizerManager,
|
350
378
|
obj: UpdateWeightsFromDistributedReqInput,
|
@@ -567,3 +595,81 @@ class TokenizerCommunicatorMixin:
|
|
567
595
|
async def get_load(self: TokenizerManager) -> List[GetLoadReqOutput]:
|
568
596
|
req = GetLoadReqInput()
|
569
597
|
return await self.get_load_communicator(req)
|
598
|
+
|
599
|
+
async def open_session(
|
600
|
+
self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
|
601
|
+
):
|
602
|
+
self.auto_create_handle_loop()
|
603
|
+
|
604
|
+
if obj.session_id is None:
|
605
|
+
obj.session_id = uuid.uuid4().hex
|
606
|
+
elif obj.session_id in self.session_futures:
|
607
|
+
return None
|
608
|
+
|
609
|
+
if self.server_args.tokenizer_worker_num > 1:
|
610
|
+
obj = MultiTokenizerWrapper(self.worker_id, obj)
|
611
|
+
self.send_to_scheduler.send_pyobj(obj)
|
612
|
+
|
613
|
+
self.session_futures[obj.session_id] = asyncio.Future()
|
614
|
+
session_id = await self.session_futures[obj.session_id]
|
615
|
+
del self.session_futures[obj.session_id]
|
616
|
+
return session_id
|
617
|
+
|
618
|
+
async def close_session(
|
619
|
+
self, obj: CloseSessionReqInput, request: Optional[fastapi.Request] = None
|
620
|
+
):
|
621
|
+
await self.send_to_scheduler.send_pyobj(obj)
|
622
|
+
|
623
|
+
def get_log_request_metadata(self):
|
624
|
+
max_length = None
|
625
|
+
skip_names = None
|
626
|
+
out_skip_names = None
|
627
|
+
if self.log_requests:
|
628
|
+
if self.log_requests_level == 0:
|
629
|
+
max_length = 1 << 30
|
630
|
+
skip_names = set(
|
631
|
+
[
|
632
|
+
"text",
|
633
|
+
"input_ids",
|
634
|
+
"input_embeds",
|
635
|
+
"image_data",
|
636
|
+
"audio_data",
|
637
|
+
"lora_path",
|
638
|
+
"sampling_params",
|
639
|
+
]
|
640
|
+
)
|
641
|
+
out_skip_names = set(
|
642
|
+
[
|
643
|
+
"text",
|
644
|
+
"output_ids",
|
645
|
+
"embedding",
|
646
|
+
]
|
647
|
+
)
|
648
|
+
elif self.log_requests_level == 1:
|
649
|
+
max_length = 1 << 30
|
650
|
+
skip_names = set(
|
651
|
+
[
|
652
|
+
"text",
|
653
|
+
"input_ids",
|
654
|
+
"input_embeds",
|
655
|
+
"image_data",
|
656
|
+
"audio_data",
|
657
|
+
"lora_path",
|
658
|
+
]
|
659
|
+
)
|
660
|
+
out_skip_names = set(
|
661
|
+
[
|
662
|
+
"text",
|
663
|
+
"output_ids",
|
664
|
+
"embedding",
|
665
|
+
]
|
666
|
+
)
|
667
|
+
elif self.log_requests_level == 2:
|
668
|
+
max_length = 2048
|
669
|
+
elif self.log_requests_level == 3:
|
670
|
+
max_length = 1 << 30
|
671
|
+
else:
|
672
|
+
raise ValueError(
|
673
|
+
f"Invalid --log-requests-level: {self.log_requests_level=}"
|
674
|
+
)
|
675
|
+
return max_length, skip_names, out_skip_names
|
@@ -43,23 +43,17 @@ from fastapi import BackgroundTasks
|
|
43
43
|
from sglang.srt.aio_rwlock import RWLock
|
44
44
|
from sglang.srt.configs.model_config import ModelConfig
|
45
45
|
from sglang.srt.disaggregation.utils import DisaggregationMode
|
46
|
-
from sglang.srt.
|
47
|
-
get_processor,
|
48
|
-
get_tokenizer,
|
49
|
-
get_tokenizer_from_processor,
|
50
|
-
)
|
51
|
-
from sglang.srt.lora.lora_registry import LoRARef, LoRARegistry
|
46
|
+
from sglang.srt.lora.lora_registry import LoRARegistry
|
52
47
|
from sglang.srt.managers.async_dynamic_batch_tokenizer import AsyncDynamicbatchTokenizer
|
53
48
|
from sglang.srt.managers.disagg_service import start_disagg_service
|
54
49
|
from sglang.srt.managers.io_struct import (
|
55
50
|
AbortReq,
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
51
|
+
BatchEmbeddingOutput,
|
52
|
+
BatchMultimodalOutput,
|
53
|
+
BatchStrOutput,
|
54
|
+
BatchTokenIDOutput,
|
60
55
|
BatchTokenizedEmbeddingReqInput,
|
61
56
|
BatchTokenizedGenerateReqInput,
|
62
|
-
CloseSessionReqInput,
|
63
57
|
ConfigureLoggingReq,
|
64
58
|
EmbeddingReqInput,
|
65
59
|
FreezeGCReq,
|
@@ -67,7 +61,6 @@ from sglang.srt.managers.io_struct import (
|
|
67
61
|
GetLoadReqInput,
|
68
62
|
HealthCheckOutput,
|
69
63
|
MultiTokenizerWrapper,
|
70
|
-
OpenSessionReqInput,
|
71
64
|
OpenSessionReqOutput,
|
72
65
|
SessionParams,
|
73
66
|
TokenizedEmbeddingReqInput,
|
@@ -84,6 +77,7 @@ from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicat
|
|
84
77
|
from sglang.srt.metrics.collector import TokenizerMetricsCollector
|
85
78
|
from sglang.srt.sampling.sampling_params import SamplingParams
|
86
79
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
80
|
+
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
87
81
|
from sglang.srt.tracing.trace import (
|
88
82
|
trace_get_proc_propagate_context,
|
89
83
|
trace_req_finish,
|
@@ -100,6 +94,11 @@ from sglang.srt.utils import (
|
|
100
94
|
get_zmq_socket,
|
101
95
|
kill_process_tree,
|
102
96
|
)
|
97
|
+
from sglang.srt.utils.hf_transformers_utils import (
|
98
|
+
get_processor,
|
99
|
+
get_tokenizer,
|
100
|
+
get_tokenizer_from_processor,
|
101
|
+
)
|
103
102
|
from sglang.utils import TypeBasedDispatcher, get_exception_traceback
|
104
103
|
|
105
104
|
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
@@ -163,6 +162,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
163
162
|
else None
|
164
163
|
)
|
165
164
|
self.crash_dump_folder = server_args.crash_dump_folder
|
165
|
+
self.enable_trace = server_args.enable_trace
|
166
166
|
|
167
167
|
# Read model args
|
168
168
|
self.model_path = server_args.model_path
|
@@ -174,8 +174,17 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
174
174
|
self.image_token_id = self.model_config.image_token_id
|
175
175
|
self.max_req_input_len = None # Will be set later in engine.py
|
176
176
|
|
177
|
+
speculative_algorithm = SpeculativeAlgorithm.from_string(
|
178
|
+
server_args.speculative_algorithm
|
179
|
+
)
|
180
|
+
self.reserve_input_token_num = (
|
181
|
+
0
|
182
|
+
if speculative_algorithm.is_none()
|
183
|
+
else server_args.speculative_num_draft_tokens
|
184
|
+
)
|
185
|
+
|
177
186
|
if self.model_config.is_multimodal:
|
178
|
-
import_processors()
|
187
|
+
import_processors("sglang.srt.multimodal.processors")
|
179
188
|
try:
|
180
189
|
_processor = get_processor(
|
181
190
|
server_args.tokenizer_path,
|
@@ -310,8 +319,8 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
310
319
|
"model_name": self.server_args.served_model_name,
|
311
320
|
# TODO: Add lora name/path in the future,
|
312
321
|
}
|
313
|
-
if server_args.
|
314
|
-
for label in server_args.
|
322
|
+
if server_args.tokenizer_metrics_allowed_custom_labels:
|
323
|
+
for label in server_args.tokenizer_metrics_allowed_custom_labels:
|
315
324
|
labels[label] = ""
|
316
325
|
self.metrics_collector = TokenizerMetricsCollector(
|
317
326
|
server_args=server_args,
|
@@ -330,10 +339,10 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
330
339
|
[
|
331
340
|
(
|
332
341
|
(
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
342
|
+
BatchStrOutput,
|
343
|
+
BatchEmbeddingOutput,
|
344
|
+
BatchTokenIDOutput,
|
345
|
+
BatchMultimodalOutput,
|
337
346
|
),
|
338
347
|
self._handle_batch_output,
|
339
348
|
),
|
@@ -371,23 +380,8 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
371
380
|
# If it's a single value, add worker_id prefix
|
372
381
|
obj.rid = f"{self.worker_id}_{obj.rid}"
|
373
382
|
|
374
|
-
if
|
375
|
-
|
376
|
-
obj.bootstrap_room if hasattr(obj, "bootstrap_room") else None
|
377
|
-
)
|
378
|
-
trace_req_start(obj.rid, bootstrap_room, ts=int(created_time * 1e9))
|
379
|
-
trace_slice_start("", obj.rid, ts=int(created_time * 1e9), anonymous=True)
|
380
|
-
else:
|
381
|
-
for i in range(len(obj.rid)):
|
382
|
-
bootstrap_room = (
|
383
|
-
obj.bootstrap_room[i]
|
384
|
-
if hasattr(obj, "bootstrap_room") and obj.bootstrap_room
|
385
|
-
else None
|
386
|
-
)
|
387
|
-
trace_req_start(obj.rid[i], bootstrap_room, ts=int(created_time * 1e9))
|
388
|
-
trace_slice_start(
|
389
|
-
"", obj.rid[i], ts=int(created_time * 1e9), anonymous=True
|
390
|
-
)
|
383
|
+
if self.enable_trace:
|
384
|
+
self._trace_request_start(obj, created_time)
|
391
385
|
|
392
386
|
if self.log_requests:
|
393
387
|
max_length, skip_names, _ = self.log_request_metadata
|
@@ -618,6 +612,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
618
612
|
_max_req_len = self.context_len
|
619
613
|
|
620
614
|
input_token_num = len(input_ids) if input_ids is not None else 0
|
615
|
+
input_token_num += self.reserve_input_token_num
|
621
616
|
if input_token_num >= self.context_len:
|
622
617
|
if self.server_args.allow_auto_truncate:
|
623
618
|
logger.warning(
|
@@ -719,7 +714,6 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
719
714
|
)
|
720
715
|
|
721
716
|
tokenized_obj = TokenizedGenerateReqInput(
|
722
|
-
obj.rid,
|
723
717
|
input_text,
|
724
718
|
input_ids,
|
725
719
|
mm_inputs,
|
@@ -729,6 +723,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
729
723
|
obj.top_logprobs_num,
|
730
724
|
obj.token_ids_logprob,
|
731
725
|
obj.stream,
|
726
|
+
rid=obj.rid,
|
732
727
|
bootstrap_host=obj.bootstrap_host,
|
733
728
|
bootstrap_port=obj.bootstrap_port,
|
734
729
|
bootstrap_room=obj.bootstrap_room,
|
@@ -738,15 +733,18 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
738
733
|
custom_logit_processor=obj.custom_logit_processor,
|
739
734
|
return_hidden_states=obj.return_hidden_states,
|
740
735
|
data_parallel_rank=obj.data_parallel_rank,
|
736
|
+
priority=obj.priority,
|
737
|
+
extra_key=obj.extra_key,
|
741
738
|
)
|
742
739
|
elif isinstance(obj, EmbeddingReqInput):
|
743
740
|
tokenized_obj = TokenizedEmbeddingReqInput(
|
744
|
-
obj.rid,
|
745
741
|
input_text,
|
746
742
|
input_ids,
|
747
743
|
mm_inputs,
|
748
744
|
token_type_ids,
|
749
745
|
sampling_params,
|
746
|
+
rid=obj.rid,
|
747
|
+
priority=obj.priority,
|
750
748
|
)
|
751
749
|
|
752
750
|
return tokenized_obj
|
@@ -1038,10 +1036,13 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1038
1036
|
def abort_request(self, rid: str = "", abort_all: bool = False):
|
1039
1037
|
if not abort_all and rid not in self.rid_to_state:
|
1040
1038
|
return
|
1041
|
-
req = AbortReq(rid, abort_all)
|
1039
|
+
req = AbortReq(rid=rid, abort_all=abort_all)
|
1042
1040
|
self.send_to_scheduler.send_pyobj(req)
|
1043
1041
|
if self.enable_metrics:
|
1044
|
-
|
1042
|
+
# TODO: also use custom_labels from the request
|
1043
|
+
self.metrics_collector.observe_one_aborted_request(
|
1044
|
+
self.metrics_collector.labels
|
1045
|
+
)
|
1045
1046
|
|
1046
1047
|
async def pause_generation(self):
|
1047
1048
|
async with self.is_pause_cond:
|
@@ -1103,84 +1104,6 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1103
1104
|
all_paused_requests = [r.num_paused_requests for r in result]
|
1104
1105
|
return all_success, all_message, all_paused_requests
|
1105
1106
|
|
1106
|
-
async def open_session(
|
1107
|
-
self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
|
1108
|
-
):
|
1109
|
-
self.auto_create_handle_loop()
|
1110
|
-
|
1111
|
-
if obj.session_id is None:
|
1112
|
-
obj.session_id = uuid.uuid4().hex
|
1113
|
-
elif obj.session_id in self.session_futures:
|
1114
|
-
return None
|
1115
|
-
|
1116
|
-
if self.server_args.tokenizer_worker_num > 1:
|
1117
|
-
obj = MultiTokenizerWrapper(self.worker_id, obj)
|
1118
|
-
self.send_to_scheduler.send_pyobj(obj)
|
1119
|
-
|
1120
|
-
self.session_futures[obj.session_id] = asyncio.Future()
|
1121
|
-
session_id = await self.session_futures[obj.session_id]
|
1122
|
-
del self.session_futures[obj.session_id]
|
1123
|
-
return session_id
|
1124
|
-
|
1125
|
-
async def close_session(
|
1126
|
-
self, obj: CloseSessionReqInput, request: Optional[fastapi.Request] = None
|
1127
|
-
):
|
1128
|
-
await self.send_to_scheduler.send_pyobj(obj)
|
1129
|
-
|
1130
|
-
def get_log_request_metadata(self):
|
1131
|
-
max_length = None
|
1132
|
-
skip_names = None
|
1133
|
-
out_skip_names = None
|
1134
|
-
if self.log_requests:
|
1135
|
-
if self.log_requests_level == 0:
|
1136
|
-
max_length = 1 << 30
|
1137
|
-
skip_names = set(
|
1138
|
-
[
|
1139
|
-
"text",
|
1140
|
-
"input_ids",
|
1141
|
-
"input_embeds",
|
1142
|
-
"image_data",
|
1143
|
-
"audio_data",
|
1144
|
-
"lora_path",
|
1145
|
-
"sampling_params",
|
1146
|
-
]
|
1147
|
-
)
|
1148
|
-
out_skip_names = set(
|
1149
|
-
[
|
1150
|
-
"text",
|
1151
|
-
"output_ids",
|
1152
|
-
"embedding",
|
1153
|
-
]
|
1154
|
-
)
|
1155
|
-
elif self.log_requests_level == 1:
|
1156
|
-
max_length = 1 << 30
|
1157
|
-
skip_names = set(
|
1158
|
-
[
|
1159
|
-
"text",
|
1160
|
-
"input_ids",
|
1161
|
-
"input_embeds",
|
1162
|
-
"image_data",
|
1163
|
-
"audio_data",
|
1164
|
-
"lora_path",
|
1165
|
-
]
|
1166
|
-
)
|
1167
|
-
out_skip_names = set(
|
1168
|
-
[
|
1169
|
-
"text",
|
1170
|
-
"output_ids",
|
1171
|
-
"embedding",
|
1172
|
-
]
|
1173
|
-
)
|
1174
|
-
elif self.log_requests_level == 2:
|
1175
|
-
max_length = 2048
|
1176
|
-
elif self.log_requests_level == 3:
|
1177
|
-
max_length = 1 << 30
|
1178
|
-
else:
|
1179
|
-
raise ValueError(
|
1180
|
-
f"Invalid --log-requests-level: {self.log_requests_level=}"
|
1181
|
-
)
|
1182
|
-
return max_length, skip_names, out_skip_names
|
1183
|
-
|
1184
1107
|
def configure_logging(self, obj: ConfigureLoggingReq):
|
1185
1108
|
if obj.log_requests is not None:
|
1186
1109
|
self.log_requests = obj.log_requests
|
@@ -1339,12 +1262,12 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1339
1262
|
# Drain requests
|
1340
1263
|
while True:
|
1341
1264
|
remain_num_req = len(self.rid_to_state)
|
1265
|
+
remaining_rids = list(self.rid_to_state.keys())
|
1342
1266
|
|
1343
1267
|
if self.server_status == ServerStatus.UnHealthy:
|
1344
1268
|
# if health check failed, we should exit immediately
|
1345
1269
|
logger.error(
|
1346
|
-
"Signal SIGTERM received while health check failed.
|
1347
|
-
remain_num_req,
|
1270
|
+
"Signal SIGTERM received while health check failed. Force exiting."
|
1348
1271
|
)
|
1349
1272
|
self.dump_requests_before_crash()
|
1350
1273
|
break
|
@@ -1352,13 +1275,12 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1352
1275
|
elif get_bool_env_var("SGL_FORCE_SHUTDOWN"):
|
1353
1276
|
# if force shutdown flag set, exit immediately
|
1354
1277
|
logger.error(
|
1355
|
-
"Signal SIGTERM received while force shutdown flag set. Force exiting
|
1356
|
-
remain_num_req,
|
1278
|
+
"Signal SIGTERM received while force shutdown flag set. Force exiting."
|
1357
1279
|
)
|
1358
1280
|
break
|
1359
1281
|
|
1360
1282
|
logger.info(
|
1361
|
-
f"Gracefully exiting...
|
1283
|
+
f"Gracefully exiting... Remaining number of requests {remain_num_req}. Remaining requests {remaining_rids=}."
|
1362
1284
|
)
|
1363
1285
|
if remain_num_req > 0:
|
1364
1286
|
await asyncio.sleep(5)
|
@@ -1379,7 +1301,10 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1379
1301
|
def _handle_batch_output(
|
1380
1302
|
self,
|
1381
1303
|
recv_obj: Union[
|
1382
|
-
|
1304
|
+
BatchStrOutput,
|
1305
|
+
BatchEmbeddingOutput,
|
1306
|
+
BatchMultimodalOutput,
|
1307
|
+
BatchTokenIDOutput,
|
1383
1308
|
],
|
1384
1309
|
):
|
1385
1310
|
for i, rid in enumerate(recv_obj.rids):
|
@@ -1413,7 +1338,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1413
1338
|
i,
|
1414
1339
|
)
|
1415
1340
|
|
1416
|
-
if not isinstance(recv_obj,
|
1341
|
+
if not isinstance(recv_obj, BatchEmbeddingOutput):
|
1417
1342
|
meta_info.update(
|
1418
1343
|
{
|
1419
1344
|
"completion_tokens": recv_obj.completion_tokens[i],
|
@@ -1424,7 +1349,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1424
1349
|
if getattr(recv_obj, "output_hidden_states", None):
|
1425
1350
|
meta_info["hidden_states"] = recv_obj.output_hidden_states[i]
|
1426
1351
|
|
1427
|
-
if isinstance(recv_obj,
|
1352
|
+
if isinstance(recv_obj, BatchStrOutput):
|
1428
1353
|
state.text += recv_obj.output_strs[i]
|
1429
1354
|
if state.obj.stream:
|
1430
1355
|
state.output_ids.extend(recv_obj.output_ids[i])
|
@@ -1439,7 +1364,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1439
1364
|
"output_ids": output_token_ids,
|
1440
1365
|
"meta_info": meta_info,
|
1441
1366
|
}
|
1442
|
-
elif isinstance(recv_obj,
|
1367
|
+
elif isinstance(recv_obj, BatchTokenIDOutput):
|
1443
1368
|
if self.server_args.stream_output and state.obj.stream:
|
1444
1369
|
state.output_ids.extend(recv_obj.output_ids[i])
|
1445
1370
|
output_token_ids = state.output_ids[state.last_output_offset :]
|
@@ -1452,10 +1377,10 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1452
1377
|
"output_ids": output_token_ids,
|
1453
1378
|
"meta_info": meta_info,
|
1454
1379
|
}
|
1455
|
-
elif isinstance(recv_obj,
|
1380
|
+
elif isinstance(recv_obj, BatchMultimodalOutput):
|
1456
1381
|
raise NotImplementedError("BatchMultimodalOut not implemented")
|
1457
1382
|
else:
|
1458
|
-
assert isinstance(recv_obj,
|
1383
|
+
assert isinstance(recv_obj, BatchEmbeddingOutput)
|
1459
1384
|
out_dict = {
|
1460
1385
|
"embedding": recv_obj.embeddings[i],
|
1461
1386
|
"meta_info": meta_info,
|
@@ -1494,7 +1419,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1494
1419
|
top_logprobs_num: int,
|
1495
1420
|
token_ids_logprob: List[int],
|
1496
1421
|
return_text_in_logprobs: bool,
|
1497
|
-
recv_obj:
|
1422
|
+
recv_obj: BatchStrOutput,
|
1498
1423
|
recv_obj_index: int,
|
1499
1424
|
):
|
1500
1425
|
if recv_obj.input_token_logprobs_val is None:
|
@@ -1612,17 +1537,17 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1612
1537
|
ret.append(None)
|
1613
1538
|
return ret
|
1614
1539
|
|
1615
|
-
def collect_metrics(self, state: ReqState, recv_obj:
|
1540
|
+
def collect_metrics(self, state: ReqState, recv_obj: BatchStrOutput, i: int):
|
1616
1541
|
completion_tokens = (
|
1617
1542
|
recv_obj.completion_tokens[i]
|
1618
1543
|
if getattr(recv_obj, "completion_tokens", None)
|
1619
1544
|
else 0
|
1620
1545
|
)
|
1621
1546
|
|
1622
|
-
|
1547
|
+
custom_labels = getattr(state.obj, "custom_labels", None)
|
1623
1548
|
labels = (
|
1624
|
-
{**self.metrics_collector.labels, **
|
1625
|
-
if
|
1549
|
+
{**self.metrics_collector.labels, **custom_labels}
|
1550
|
+
if custom_labels
|
1626
1551
|
else self.metrics_collector.labels
|
1627
1552
|
)
|
1628
1553
|
if (
|
@@ -1708,7 +1633,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1708
1633
|
|
1709
1634
|
asyncio.create_task(asyncio.to_thread(background_task))
|
1710
1635
|
|
1711
|
-
def _handle_abort_req(self, recv_obj):
|
1636
|
+
def _handle_abort_req(self, recv_obj: AbortReq):
|
1712
1637
|
if is_health_check_generate_req(recv_obj):
|
1713
1638
|
return
|
1714
1639
|
state = self.rid_to_state[recv_obj.rid]
|
@@ -1874,6 +1799,29 @@ class TokenizerManager(TokenizerCommunicatorMixin):
|
|
1874
1799
|
load_udpate_req = WatchLoadUpdateReq(loads=loads)
|
1875
1800
|
self.send_to_scheduler.send_pyobj(load_udpate_req)
|
1876
1801
|
|
1802
|
+
def _trace_request_start(
|
1803
|
+
self,
|
1804
|
+
obj: Union[GenerateReqInput, EmbeddingReqInput],
|
1805
|
+
created_time: Optional[float] = None,
|
1806
|
+
):
|
1807
|
+
if obj.is_single:
|
1808
|
+
bootstrap_room = (
|
1809
|
+
obj.bootstrap_room if hasattr(obj, "bootstrap_room") else None
|
1810
|
+
)
|
1811
|
+
trace_req_start(obj.rid, bootstrap_room, ts=int(created_time * 1e9))
|
1812
|
+
trace_slice_start("", obj.rid, ts=int(created_time * 1e9), anonymous=True)
|
1813
|
+
else:
|
1814
|
+
for i in range(len(obj.rid)):
|
1815
|
+
bootstrap_room = (
|
1816
|
+
obj.bootstrap_room[i]
|
1817
|
+
if hasattr(obj, "bootstrap_room") and obj.bootstrap_room
|
1818
|
+
else None
|
1819
|
+
)
|
1820
|
+
trace_req_start(obj.rid[i], bootstrap_room, ts=int(created_time * 1e9))
|
1821
|
+
trace_slice_start(
|
1822
|
+
"", obj.rid[i], ts=int(created_time * 1e9), anonymous=True
|
1823
|
+
)
|
1824
|
+
|
1877
1825
|
|
1878
1826
|
class ServerStatus(Enum):
|
1879
1827
|
Up = "Up"
|
@@ -1919,7 +1867,7 @@ class SignalHandler:
|
|
1919
1867
|
|
1920
1868
|
def running_phase_sigquit_handler(self, signum=None, frame=None):
|
1921
1869
|
logger.error(
|
1922
|
-
"
|
1870
|
+
f"SIGQUIT received. {signum=}, {frame=}. It usually means one child failed."
|
1923
1871
|
)
|
1924
1872
|
self.tokenizer_manager.dump_requests_before_crash()
|
1925
1873
|
kill_process_tree(os.getpid())
|