sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +67 -43
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +88 -53
- sglang/srt/entrypoints/openai/protocol.py +7 -4
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +39 -19
- sglang/srt/entrypoints/openai/serving_completions.py +15 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -7
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +182 -49
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +68 -41
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +200 -199
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +191 -139
- sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +260 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +18 -33
- sglang/srt/mem_cache/hiradix_cache.py +108 -48
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +121 -57
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +502 -77
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +75 -19
- sglang/srt/model_executor/model_runner.py +357 -30
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +346 -48
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +11 -2
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +60 -13
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +40 -9
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +355 -37
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +197 -112
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +46 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +12 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler_metrics_mixin.py

@@ -1,15 +1,24 @@
+from __future__ import annotations
+
 import logging
 import time
 from collections import defaultdict
-from typing import List, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
+
+import torch
 
 from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
 from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_policy import PrefillAdder
 from sglang.srt.managers.scheduler import Req, ScheduleBatch
+from sglang.srt.managers.utils import DPBalanceMeta
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
 from sglang.srt.utils import get_bool_env_var
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import Scheduler
+
 logger = logging.getLogger(__name__)
 
 RECORD_STEP_TIME = get_bool_env_var("SGLANG_RECORD_STEP_TIME")
@@ -28,7 +37,9 @@ class KvMetrics:
 
 
 class SchedulerMetricsMixin:
-    def init_metrics(self, tp_rank: int, pp_rank: int, dp_rank: Optional[int]):
+    def init_metrics(
+        self: Scheduler, tp_rank: int, pp_rank: int, dp_rank: Optional[int]
+    ):
         self.last_gen_throughput: float = 0.0
         self.last_input_throughput: float = 0.0
         self.step_time_dict = defaultdict(list)  # Dict[batch size -> step time]
@@ -50,14 +61,24 @@ class SchedulerMetricsMixin:
             labels["dp_rank"] = dp_rank
         self.metrics_collector = SchedulerMetricsCollector(labels=labels)
 
-    def init_kv_events(self, kv_events_config: Optional[str]):
+    def init_dp_balance(self: Scheduler, dp_balance_meta: Optional[DPBalanceMeta]):
+        self.balance_meta = dp_balance_meta
+        if (
+            self.server_args.enable_dp_attention
+            and self.server_args.load_balance_method == "minimum_tokens"
+        ):
+            assert dp_balance_meta is not None
+
+            self.recv_dp_balance_id_this_term = []
+
+    def init_kv_events(self: Scheduler, kv_events_config: Optional[str]):
         if self.enable_kv_cache_events:
             self.kv_event_publisher = EventPublisherFactory.create(
                 kv_events_config, self.attn_dp_rank
             )
 
     def log_prefill_stats(
-        self,
+        self: Scheduler,
         adder: PrefillAdder,
         can_run_list: List[Req],
         running_bs: int,
@@ -138,7 +159,7 @@ class SchedulerMetricsMixin:
         self._publish_kv_events()
 
     def log_decode_stats(
-        self, can_run_cuda_graph: bool, running_batch: ScheduleBatch = None
+        self: Scheduler, can_run_cuda_graph: bool, running_batch: ScheduleBatch = None
     ):
         batch = running_batch or self.running_batch
 
@@ -193,7 +214,7 @@ class SchedulerMetricsMixin:
             msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "
 
         msg += (
-            f"cuda graph: {can_run_cuda_graph}, "
+            f"{'cpu graph' if self.device == 'cpu' else 'cuda graph'}: {can_run_cuda_graph}, "
             f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
             f"#queue-req: {len(self.waiting_queue)}, "
         )
@@ -209,7 +230,7 @@ class SchedulerMetricsMixin:
         self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
         self.stats.spec_accept_length = spec_accept_length
         self.stats.total_retracted_reqs = self.total_retracted_reqs
-        self.metrics_collector.log_stats(self.stats)
+        self.stats.avg_request_queue_latency = 0.0
         if self.disaggregation_mode == DisaggregationMode.DECODE:
             self.stats.num_decode_prealloc_queue_reqs = len(
                 self.disagg_decode_prealloc_queue.queue
@@ -217,10 +238,11 @@ class SchedulerMetricsMixin:
             self.stats.num_decode_transfer_queue_reqs = len(
                 self.disagg_decode_transfer_queue.queue
             )
+        self.metrics_collector.log_stats(self.stats)
         self._emit_kv_metrics()
         self._publish_kv_events()
 
-    def _emit_kv_metrics(self):
+    def _emit_kv_metrics(self: Scheduler):
         kv_metrics = KvMetrics()
         kv_metrics.request_active_slots = self.stats.num_running_reqs
         kv_metrics.request_total_slots = self.max_running_requests
@@ -236,9 +258,94 @@ class SchedulerMetricsMixin:
         if not self.send_metrics_from_scheduler.closed:
             self.send_metrics_from_scheduler.send_pyobj(kv_metrics)
 
-    def _publish_kv_events(self):
+    def _publish_kv_events(self: Scheduler):
         if self.enable_kv_cache_events:
             events = self.tree_cache.take_events()
             if events:
                 batch = KVEventBatch(ts=time.time(), events=events)
                 self.kv_event_publisher.publish(batch)
+
+    def maybe_update_dp_balance_data(
+        self: Scheduler, recv_req: TokenizedGenerateReqInput
+    ):
+        if (
+            self.server_args.enable_dp_attention
+            and self.server_args.load_balance_method == "minimum_tokens"
+        ):
+            self.recv_dp_balance_id_this_term.append(recv_req.dp_balance_id)
+
+    def maybe_handle_dp_balance_data(self: Scheduler):
+        if (
+            self.server_args.load_balance_method == "minimum_tokens"
+            and self.forward_ct % 40 == 0
+        ):
+            holding_tokens = self.get_load().num_tokens
+
+            new_recv_dp_balance_id_list, holding_token_list = (
+                self.gather_dp_balance_info(holding_tokens)
+            )
+
+            self.recv_dp_balance_id_this_term.clear()
+            if self.tp_rank == 0:  # only first worker write info
+                self.write_shared_dp_balance_info(
+                    new_recv_dp_balance_id_list, holding_token_list
+                )
+
+    def gather_dp_balance_info(
+        self: Scheduler, holding_tokens_list
+    ) -> Union[None, List[List[int]]]:
+        """gather recv_dp_balance_id_this_term and holding tokens per worker for dp balance"""
+        recv_list = self.recv_dp_balance_id_this_term
+        assert len(recv_list) <= 511, (
+            "The number of requests received this round is too large. "
+            "Please increase gather_tensor_size and onfly_info_size."
+        )
+        # The maximum size of the tensor used for gathering data from all workers.
+        gather_tensor_size = 512
+
+        # recv_tensor: | holding_tokens | len(recv_dp_balance_id) | recv_dp_balance_ids
+        recv_tensor = torch.zeros(gather_tensor_size, dtype=torch.int32)
+        recv_tensor[0] = holding_tokens_list
+        recv_tensor[1] = len(recv_list)  # The first element is the length of the list.
+        recv_tensor[2 : len(recv_list) + 2] = torch.tensor(recv_list, dtype=torch.int32)
+
+        if self.tp_rank == 0:
+            gathered_list = [
+                torch.zeros(gather_tensor_size, dtype=torch.int32)
+                for _ in range(self.balance_meta.num_workers)
+            ]
+        else:
+            gathered_list = None
+
+        torch.distributed.gather(recv_tensor, gathered_list, group=self.tp_cpu_group)
+
+        gathered_id_list_per_worker = None
+        if self.tp_rank == 0:
+            gathered_id_list_per_worker = []
+            holding_tokens_list = []
+            for tensor in gathered_list:
+                holding_tokens_list.append(tensor[0].item())
+                list_length = tensor[1].item()
+                gathered_id_list_per_worker.append(tensor[2 : list_length + 2].tolist())
+
+        return gathered_id_list_per_worker, holding_tokens_list
+
+    def write_shared_dp_balance_info(self: Scheduler, new_recv_rid_lists, local_tokens):
+        meta = self.balance_meta
+
+        with meta.mutex:
+            onfly_list: List[Dict[int, int]] = meta.get_shared_onfly()
+            assert len(new_recv_rid_lists) == len(onfly_list), "num_worker not equal"
+            # 1.Check if the rid received by each worker this round is present in onfly.
+            # If it is, remove the corresponding onfly item.
+            worker_id = 0
+            for new_recv_rids, on_fly_reqs in zip(new_recv_rid_lists, onfly_list):
+                for new_recv_rid in new_recv_rids:
+                    assert (
+                        new_recv_rid in on_fly_reqs
+                    ), f"{new_recv_rid=} not in {worker_id=} {on_fly_reqs=}, data consistency is wrong"
+                    del on_fly_reqs[new_recv_rid]
+                worker_id += 1
+            # 2. Atomically write local_tokens and onfly into shm under the mutex
+            meta.set_shared_onfly_info(onfly_list)
+            meta.set_shared_local_tokens(local_tokens)
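The DP-balance helpers added in this file pack each worker's report into a fixed-size int32 buffer (slot 0: held tokens, slot 1: id count, slots 2+: request ids) so that a single `torch.distributed.gather` can collect variable-length id lists. A minimal standalone sketch of that encoding, with illustrative names rather than sglang APIs:

```python
import torch

GATHER_TENSOR_SIZE = 512  # fixed buffer width, matching the diff above

def encode_report(holding_tokens: int, recv_ids: list) -> torch.Tensor:
    # Layout: | holding_tokens | len(recv_ids) | recv_ids... | zero padding |
    assert len(recv_ids) <= GATHER_TENSOR_SIZE - 2, "id list exceeds buffer capacity"
    buf = torch.zeros(GATHER_TENSOR_SIZE, dtype=torch.int32)
    buf[0] = holding_tokens
    buf[1] = len(recv_ids)
    buf[2 : len(recv_ids) + 2] = torch.tensor(recv_ids, dtype=torch.int32)
    return buf

def decode_report(buf: torch.Tensor):
    # Inverse operation: read the length prefix, then slice out that many ids.
    n = int(buf[1].item())
    return int(buf[0].item()), buf[2 : n + 2].tolist()

# Round trip: every rank encodes, rank 0 gathers and decodes each buffer.
tokens, ids = decode_report(encode_report(37, [5, 9, 11]))
assert (tokens, ids) == (37, [5, 9, 11])
```

The fixed width is what makes the collective cheap: every rank contributes the same tensor shape, and the length prefix lets rank 0 discard the padding.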
sglang/srt/managers/scheduler_output_processor_mixin.py

@@ -5,6 +5,8 @@ import threading
 import time
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
+import torch
+
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.io_struct import AbortReq, BatchEmbeddingOut, BatchTokenIDOut
@@ -71,6 +73,7 @@ class SchedulerOutputProcessorMixin:
 
         # Check finish conditions
         logprob_pt = 0
+
         for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
             if req.is_retracted:
                 continue
@@ -99,6 +102,7 @@
                     extend_logprob_start_len = extend_logprob_start_len_per_req[i]
                     extend_input_len = extend_input_len_per_req[i]
                     num_input_logprobs = extend_input_len - extend_logprob_start_len
+
                     if req.return_logprob:
                         self.add_logprob_return_values(
                             i,
@@ -441,27 +445,59 @@
         output: LogitsProcessorOutput,
     ):
         """Attach logprobs to the return values."""
-        req.output_token_logprobs_val.append(output.next_token_logprobs[i])
-        req.output_token_logprobs_idx.append(next_token_ids[i])
-
-        self.add_input_logprob_return_values(
-            i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
-        )
+        if output.next_token_logprobs is not None:
+            req.output_token_logprobs_val.append(output.next_token_logprobs[i])
+            req.output_token_logprobs_idx.append(next_token_ids[i])
+
+        # Only add input logprobs if there are input tokens to process
+        # Note: For prefill-only requests with default logprob_start_len, this will be 0,
+        # meaning we only compute output logprobs (which is the intended behavior)
+        if num_input_logprobs > 0:
+            self.add_input_logprob_return_values(
+                i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
+            )
+        else:
+            self._initialize_empty_logprob_containers(req)
 
         if req.top_logprobs_num > 0:
             req.output_top_logprobs_val.append(output.next_token_top_logprobs_val[i])
             req.output_top_logprobs_idx.append(output.next_token_top_logprobs_idx[i])
 
-        if req.token_ids_logprob is not None:
-            req.output_token_ids_logprobs_val.append(
-                output.next_token_token_ids_logprobs_val[i]
-            )
+        if (
+            req.token_ids_logprob is not None
+            and output.next_token_token_ids_logprobs_val is not None
+        ):
+            # Convert GPU tensor to list if needed
+            logprobs_val = output.next_token_token_ids_logprobs_val[i]
+            if isinstance(logprobs_val, torch.Tensor):
+                logprobs_val = logprobs_val.tolist()
+            req.output_token_ids_logprobs_val.append(logprobs_val)
             req.output_token_ids_logprobs_idx.append(
                 output.next_token_token_ids_logprobs_idx[i]
             )
 
         return num_input_logprobs
 
+    def _initialize_empty_logprob_containers(self, req: Req) -> None:
+        """
+        Initialize logprob fields to empty lists if unset.
+
+        This is needed for prefill-only requests where the normal initialization
+        flow might be bypassed, but downstream code expects these fields to be lists.
+        """
+        if req.input_token_logprobs_val is None:
+            req.input_token_logprobs_val = []
+        if req.input_token_logprobs_idx is None:
+            req.input_token_logprobs_idx = []
+        if req.input_top_logprobs_val is None:
+            req.input_top_logprobs_val = []
+        if req.input_top_logprobs_idx is None:
+            req.input_top_logprobs_idx = []
+        if req.input_token_ids_logprobs_val is None:
+            req.input_token_ids_logprobs_val = []
+        if req.input_token_ids_logprobs_idx is None:
+            req.input_token_ids_logprobs_idx = []
+
     def stream_output(
         self: Scheduler,
         reqs: List[Req],
@@ -700,6 +736,8 @@ class SchedulerOutputProcessorMixin:
                     output_token_ids_logprobs_val,
                     output_token_ids_logprobs_idx,
                     output_hidden_states,
+                    placeholder_tokens_idx=None,
+                    placeholder_tokens_val=None,
                 )
             )
 
@@ -719,6 +757,12 @@
             cached_tokens.append(req.cached_tokens)
         self.send_to_detokenizer.send_pyobj(
             BatchEmbeddingOut(
-                rids, finished_reasons, embeddings, prompt_tokens, cached_tokens
+                rids,
+                finished_reasons,
+                embeddings,
+                prompt_tokens,
+                cached_tokens,
+                placeholder_tokens_idx=None,
+                placeholder_tokens_val=None,
             )
         )
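Several of the logprob changes above guard against values arriving as GPU tensors rather than Python lists (see the `isinstance(logprobs_val, torch.Tensor)` check) before appending them to per-request containers. A minimal sketch of that normalization pattern, standalone rather than the sglang API:

```python
import torch

def to_host_list(value):
    # Tensors (CPU or GPU) become plain Python lists; other values pass through.
    if isinstance(value, torch.Tensor):
        return value.tolist()
    return value

print(to_host_list(torch.tensor([3, 1, 4])))  # [3, 1, 4]
print(to_host_list([0.5, 0.25]))              # [0.5, 0.25], unchanged
```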
sglang/srt/managers/scheduler_profiler_mixin.py

@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 
 
 class SchedulerProfilerMixin:
-    def init_profier(self):
+    def init_profiler(self):
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
         self.profiler_activities: Optional[List[str]] = None
sglang/srt/managers/template_manager.py

@@ -24,20 +24,20 @@ import os
 import re
 from typing import Optional
 
-from sglang.srt.code_completion_parser import (
+from sglang.srt.parser.code_completion_parser import (
     CompletionTemplate,
     FimPosition,
     completion_template_exists,
     register_completion_template,
 )
-from sglang.srt.conversation import (
+from sglang.srt.parser.conversation import (
     Conversation,
     SeparatorStyle,
     chat_template_exists,
     get_conv_template_by_model_path,
     register_conv_template,
 )
-from sglang.srt.jinja_template_utils import detect_jinja_template_content_format
+from sglang.srt.parser.jinja_template_utils import detect_jinja_template_content_format
 
 logger = logging.getLogger(__name__)
 
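This hunk is one instance of the parser reorganization in 0.5.3rc0: `code_completion_parser.py`, `conversation.py`, `harmony_parser.py`, `jinja_template_utils.py`, and `reasoning_parser.py` all moved under `sglang.srt.parser` (see the renames in the file list above). Downstream code that imports these modules directly needs the same mechanical update; for example, assuming 0.5.3rc0 is installed:

```python
# 0.5.2rc1
# from sglang.srt.conversation import Conversation
# from sglang.srt.jinja_template_utils import detect_jinja_template_content_format

# 0.5.3rc0
from sglang.srt.parser.conversation import Conversation
from sglang.srt.parser.jinja_template_utils import detect_jinja_template_content_format
```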