sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +251 -26
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +63 -3
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +34 -19
- sglang/srt/entrypoints/openai/serving_completions.py +10 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +12 -0
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +250 -112
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +110 -49
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +43 -29
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -45
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +242 -278
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +13 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +160 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +90 -115
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +41 -477
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +24 -22
- sglang/srt/mem_cache/hiradix_cache.py +184 -101
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +324 -41
- sglang/srt/mem_cache/memory_pool_host.py +25 -18
- sglang/srt/mem_cache/radix_cache.py +5 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +189 -31
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +311 -50
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +5 -18
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +90 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +297 -79
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/utils.py +37 -2
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_communicator_mixin.py (new file)

@@ -0,0 +1,491 @@

from __future__ import annotations

import asyncio
import logging
import os
import time
from collections import deque
from typing import (
    TYPE_CHECKING,
    Any,
    Deque,
    Dict,
    Generic,
    List,
    Optional,
    Tuple,
    TypeVar,
)

import fastapi

from sglang.srt.managers.io_struct import (
    ClearHiCacheReqInput,
    ClearHiCacheReqOutput,
    ExpertDistributionReq,
    ExpertDistributionReqOutput,
    FlushCacheReqInput,
    FlushCacheReqOutput,
    GetInternalStateReq,
    GetInternalStateReqOutput,
    GetWeightsByNameReqInput,
    GetWeightsByNameReqOutput,
    InitWeightsUpdateGroupReqInput,
    InitWeightsUpdateGroupReqOutput,
    LoadLoRAAdapterReqInput,
    LoadLoRAAdapterReqOutput,
    LoRAUpdateResult,
    MultiTokenizerWrapper,
    ProfileReq,
    ProfileReqOutput,
    ProfileReqType,
    ReleaseMemoryOccupationReqInput,
    ReleaseMemoryOccupationReqOutput,
    ResumeMemoryOccupationReqInput,
    ResumeMemoryOccupationReqOutput,
    SetInternalStateReq,
    SetInternalStateReqOutput,
    SlowDownReqInput,
    SlowDownReqOutput,
    UnloadLoRAAdapterReqInput,
    UnloadLoRAAdapterReqOutput,
    UpdateWeightsFromDistributedReqInput,
    UpdateWeightsFromDistributedReqOutput,
    UpdateWeightsFromTensorReqInput,
    UpdateWeightsFromTensorReqOutput,
)
from sglang.srt.server_args import LoRARef, ServerArgs
from sglang.srt.utils import get_bool_env_var
from sglang.utils import TypeBasedDispatcher

if TYPE_CHECKING:
    from sglang.srt.managers.tokenizer_manager import TokenizerManager

T = TypeVar("T")

logger = logging.getLogger(__name__)


class _Communicator(Generic[T]):
    """Note: The communicator now only run up to 1 in-flight request at any time."""

    enable_multi_tokenizer = False

    def __init__(self, sender, fan_out: int):
        self._sender = sender
        self._fan_out = fan_out
        self._result_event: Optional[asyncio.Event] = None
        self._result_values: Optional[List[T]] = None
        self._ready_queue: Deque[asyncio.Future] = deque()

    async def __call__(self, obj):
        ready_event = asyncio.Event()
        if self._result_event is not None or len(self._ready_queue) > 0:
            self._ready_queue.append(ready_event)
            await ready_event.wait()
            assert self._result_event is None
            assert self._result_values is None

        if obj:
            if _Communicator.enable_multi_tokenizer:
                obj = MultiTokenizerWrapper(worker_id=os.getpid(), obj=obj)
            self._sender.send_pyobj(obj)

        self._result_event = asyncio.Event()
        self._result_values = []
        await self._result_event.wait()
        result_values = self._result_values
        self._result_event = self._result_values = None

        if len(self._ready_queue) > 0:
            self._ready_queue.popleft().set()

        return result_values

    def handle_recv(self, recv_obj: T):
        self._result_values.append(recv_obj)
        if len(self._result_values) == self._fan_out:
            self._result_event.set()


class TokenizerCommunicatorMixin:
    """Mixin class for TokenizerManager to handle communication with the scheduler."""

    def init_communicators(self: TokenizerManager, server_args: ServerArgs):
        # Communicators
        self.init_weights_update_group_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.update_weights_from_distributed_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.update_weights_from_tensor_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.get_weights_by_name_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.release_memory_occupation_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.resume_memory_occupation_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.slow_down_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.flush_cache_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.clear_hicache_storage_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.profile_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.get_internal_state_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.set_internal_state_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.expert_distribution_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )
        self.update_lora_adapter_communicator = _Communicator(
            self.send_to_scheduler, server_args.dp_size
        )

        self._result_dispatcher += self._get_communicator_dispatcher()

    def _get_communicator_dispatcher(self: TokenizerManager):
        return TypeBasedDispatcher(
            [
                (
                    InitWeightsUpdateGroupReqOutput,
                    self.init_weights_update_group_communicator.handle_recv,
                ),
                (
                    UpdateWeightsFromDistributedReqOutput,
                    self.update_weights_from_distributed_communicator.handle_recv,
                ),
                (
                    UpdateWeightsFromTensorReqOutput,
                    self.update_weights_from_tensor_communicator.handle_recv,
                ),
                (
                    GetWeightsByNameReqOutput,
                    self.get_weights_by_name_communicator.handle_recv,
                ),
                (
                    ReleaseMemoryOccupationReqOutput,
                    self.release_memory_occupation_communicator.handle_recv,
                ),
                (
                    ResumeMemoryOccupationReqOutput,
                    self.resume_memory_occupation_communicator.handle_recv,
                ),
                (
                    SlowDownReqOutput,
                    self.slow_down_communicator.handle_recv,
                ),
                (
                    ClearHiCacheReqOutput,
                    self.clear_hicache_storage_communicator.handle_recv,
                ),
                (
                    FlushCacheReqOutput,
                    self.flush_cache_communicator.handle_recv,
                ),
                (
                    ProfileReqOutput,
                    self.profile_communicator.handle_recv,
                ),
                (
                    GetInternalStateReqOutput,
                    self.get_internal_state_communicator.handle_recv,
                ),
                (
                    SetInternalStateReqOutput,
                    self.set_internal_state_communicator.handle_recv,
                ),
                (
                    ExpertDistributionReqOutput,
                    self.expert_distribution_communicator.handle_recv,
                ),
                (
                    LoRAUpdateResult,
                    self.update_lora_adapter_communicator.handle_recv,
                ),
            ]
        )

    async def flush_cache(self: TokenizerManager) -> FlushCacheReqOutput:
        return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]

    async def clear_hicache_storage(self: TokenizerManager) -> ClearHiCacheReqOutput:
        """Clear the hierarchical cache storage."""
        # Delegate to the scheduler to handle HiCacheStorage clearing
        return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[
            0
        ]

    async def start_profile(
        self: TokenizerManager,
        output_dir: Optional[str] = None,
        start_step: Optional[int] = None,
        num_steps: Optional[int] = None,
        activities: Optional[List[str]] = None,
        with_stack: Optional[bool] = None,
        record_shapes: Optional[bool] = None,
        profile_by_stage: bool = False,
    ):
        self.auto_create_handle_loop()
        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
        with_stack = False if with_stack is False or env_with_stack is False else True
        req = ProfileReq(
            type=ProfileReqType.START_PROFILE,
            output_dir=output_dir,
            start_step=start_step,
            num_steps=num_steps,
            activities=activities,
            with_stack=with_stack,
            record_shapes=record_shapes,
            profile_by_stage=profile_by_stage,
            profile_id=str(time.time()),
        )
        return await self._execute_profile(req)

    async def stop_profile(self: TokenizerManager):
        self.auto_create_handle_loop()
        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
        return await self._execute_profile(req)

    async def _execute_profile(self: TokenizerManager, req: ProfileReq):
        result = (await self.profile_communicator(req))[0]
        if not result.success:
            raise RuntimeError(result.message)
        return result

    async def start_expert_distribution_record(self: TokenizerManager):
        self.auto_create_handle_loop()
        await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)

    async def stop_expert_distribution_record(self: TokenizerManager):
        self.auto_create_handle_loop()
        await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)

    async def dump_expert_distribution_record(self: TokenizerManager):
        self.auto_create_handle_loop()
        await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)

    async def init_weights_update_group(
        self: TokenizerManager,
        obj: InitWeightsUpdateGroupReqInput,
        request: Optional[fastapi.Request] = None,
    ) -> Tuple[bool, str]:
        self.auto_create_handle_loop()
        assert (
            self.server_args.dp_size == 1
        ), "dp_size must be 1 for init parameter update group"
        result = (await self.init_weights_update_group_communicator(obj))[0]
        return result.success, result.message

    async def update_weights_from_distributed(
        self: TokenizerManager,
        obj: UpdateWeightsFromDistributedReqInput,
        request: Optional[fastapi.Request] = None,
    ) -> Tuple[bool, str]:
        self.auto_create_handle_loop()
        assert (
            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"

        if obj.abort_all_requests:
            self.abort_request(abort_all=True)

        # This means that weight sync
        # cannot run while requests are in progress.
        async with self.model_update_lock.writer_lock:
            result = (await self.update_weights_from_distributed_communicator(obj))[0]
            return result.success, result.message

    async def update_weights_from_tensor(
        self: TokenizerManager,
        obj: UpdateWeightsFromTensorReqInput,
        request: Optional[fastapi.Request] = None,
    ) -> Tuple[bool, str]:
        self.auto_create_handle_loop()
        assert (
            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
        ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"

        if obj.abort_all_requests:
            self.abort_request(abort_all=True)

        # This means that weight sync
        # cannot run while requests are in progress.
        async with self.model_update_lock.writer_lock:
            result = (await self.update_weights_from_tensor_communicator(obj))[0]
            return result.success, result.message

    async def load_lora_adapter(
        self: TokenizerManager,
        obj: LoadLoRAAdapterReqInput,
        _: Optional[fastapi.Request] = None,
    ) -> LoadLoRAAdapterReqOutput:
        self.auto_create_handle_loop()

        try:
            if not self.server_args.enable_lora:
                raise ValueError(
                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
                )

            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
            # with dp_size > 1.
            assert (
                self.server_args.dp_size == 1
            ), "dp_size must be 1 for dynamic lora loading"
            logger.info(
                "Start load Lora adapter. Lora name=%s, path=%s",
                obj.lora_name,
                obj.lora_path,
            )

            async with self.lora_update_lock:
                if (
                    self.server_args.max_loaded_loras is not None
                    and self.lora_registry.num_registered_loras
                    >= self.server_args.max_loaded_loras
                ):
                    raise ValueError(
                        f"Cannot load LoRA adapter {obj.lora_name} at path {obj.lora_path}. "
                        f"Maximum number of loaded LoRA adapters is {self.server_args.max_loaded_loras}. "
                        "Please unload some LoRA adapters before loading new ones."
                    )

                # Generate new uniquely identifiable LoRARef object.
                new_adapter = LoRARef(
                    lora_name=obj.lora_name,
                    lora_path=obj.lora_path,
                    pinned=obj.pinned,
                )

                # Trigger the actual loading operation at the backend processes.
                obj.lora_id = new_adapter.lora_id
                result = (await self.update_lora_adapter_communicator(obj))[0]

                # Register the LoRA adapter only after loading is successful.
                if result.success:
                    await self.lora_registry.register(new_adapter)

                return result
        except ValueError as e:
            return LoadLoRAAdapterReqOutput(
                success=False,
                error_message=str(e),
            )

    async def unload_lora_adapter(
        self: TokenizerManager,
        obj: UnloadLoRAAdapterReqInput,
        _: Optional[fastapi.Request] = None,
    ) -> UnloadLoRAAdapterReqOutput:
        self.auto_create_handle_loop()

        try:
            if not self.server_args.enable_lora:
                raise ValueError(
                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
                )

            assert (
                obj.lora_name is not None
            ), "lora_name must be provided to unload LoRA adapter"

            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
            # with dp_size > 1.
            assert (
                self.server_args.dp_size == 1
            ), "dp_size must be 1 for dynamic lora loading"
            logger.info(
                "Start unload Lora adapter. Lora name=%s",
                obj.lora_name,
            )

            async with self.lora_update_lock:
                # Unregister the LoRA adapter from the registry to stop new requests for this adapter
                # from being started.
                lora_id = await self.lora_registry.unregister(obj.lora_name)
                obj.lora_id = lora_id

                # Initiate the actual unloading operation at the backend processes only after all
                # ongoing requests using this LoRA adapter are finished.
                await self.lora_registry.wait_for_unload(lora_id)
                result = (await self.update_lora_adapter_communicator(obj))[0]

                return result
        except ValueError as e:
            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))

    async def get_weights_by_name(
        self: TokenizerManager,
        obj: GetWeightsByNameReqInput,
        request: Optional[fastapi.Request] = None,
    ):
        self.auto_create_handle_loop()
        results = await self.get_weights_by_name_communicator(obj)
        all_parameters = [r.parameter for r in results]
        if self.server_args.dp_size == 1:
            return all_parameters[0]
        else:
            return all_parameters

    async def release_memory_occupation(
        self: TokenizerManager,
        obj: ReleaseMemoryOccupationReqInput,
        request: Optional[fastapi.Request] = None,
    ):
        self.auto_create_handle_loop()
        await self.release_memory_occupation_communicator(obj)

    async def resume_memory_occupation(
        self: TokenizerManager,
        obj: ResumeMemoryOccupationReqInput,
        request: Optional[fastapi.Request] = None,
    ):
        self.auto_create_handle_loop()
        await self.resume_memory_occupation_communicator(obj)

    async def slow_down(
        self: TokenizerManager,
        obj: SlowDownReqInput,
        request: Optional[fastapi.Request] = None,
    ):
        self.auto_create_handle_loop()
        await self.slow_down_communicator(obj)

    async def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]:
        req = GetInternalStateReq()
        responses: List[GetInternalStateReqOutput] = (
            await self.get_internal_state_communicator(req)
        )
        # Many DP ranks
        return [res.internal_state for res in responses]

    async def set_internal_state(
        self: TokenizerManager, obj: SetInternalStateReq
    ) -> List[bool]:
        responses: List[SetInternalStateReqOutput] = (
            await self.set_internal_state_communicator(obj)
        )
        return [res.updated for res in responses]

    async def get_load(self: TokenizerManager) -> dict:
        # TODO(lsyin): fake load report server
        if not self.current_load_lock.locked():
            async with self.current_load_lock:
                internal_state = await self.get_internal_state()
                self.current_load = internal_state[0]["load"]
        return {"load": self.current_load}
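To make the request/response flow of the new mixin easier to follow, here is a minimal sketch of the `_Communicator` fan-out pattern shown above. It assumes sglang 0.5.2 is installed (the import path comes from the new module); `FakeSender` is a hypothetical stand-in for the ZMQ `send_to_scheduler` socket (only a `send_pyobj` method is needed), and `handle_recv` is called by hand here, whereas in the real `TokenizerManager` it is driven by the `TypeBasedDispatcher` wired up in `_get_communicator_dispatcher`.

```python
import asyncio

# Assumes sglang 0.5.2 is installed; _Communicator is an internal class.
from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator


class FakeSender:
    """Hypothetical stand-in for the scheduler-facing ZMQ socket."""

    def __init__(self):
        self.sent = []

    def send_pyobj(self, obj):
        # The real socket serializes and ships obj to the scheduler processes.
        self.sent.append(obj)


async def main():
    # fan_out=2 mimics dp_size=2: one reply is expected per data-parallel rank.
    comm = _Communicator(FakeSender(), fan_out=2)

    # __call__ sends the request and then waits until handle_recv has been
    # invoked fan_out times.
    request_task = asyncio.create_task(comm({"cmd": "flush_cache"}))
    await asyncio.sleep(0)  # let the task send the request and start waiting

    # Simulate the per-rank replies that the result dispatcher would deliver.
    comm.handle_recv({"rank": 0, "success": True})
    comm.handle_recv({"rank": 1, "success": True})

    results = await request_task
    print(results)  # two per-rank replies, in arrival order


asyncio.run(main())
```

Because `init_communicators` creates every communicator with `fan_out=server_args.dp_size`, an awaited call resolves only once every DP scheduler has replied; that is why the mixin's methods either index `[0]` or return the whole list depending on `dp_size`, as in `get_weights_by_name` above.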