sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +24 -3
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +85 -54
- sglang/srt/entrypoints/openai/protocol.py +4 -1
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +36 -16
- sglang/srt/entrypoints/openai/serving_completions.py +12 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +6 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +147 -47
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +64 -40
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +158 -160
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +187 -39
- sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +259 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/hicache_storage.py +3 -23
- sglang/srt/mem_cache/hiradix_cache.py +103 -43
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +105 -46
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +493 -76
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +59 -2
- sglang/srt/model_executor/model_runner.py +356 -29
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +109 -15
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +1 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +56 -12
- sglang/srt/models/qwen3_moe.py +1 -1
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +276 -35
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +43 -4
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +34 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +11 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
- sglang/srt/disaggregation/launch_lb.py +0 -118
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/grpc_server.py (new file)
@@ -0,0 +1,680 @@
+"""
+Standalone gRPC Server for SGLang - Fully separated from HTTP server.
+Uses GrpcRequestManager for orchestration without tokenization.
+"""
+
+import argparse
+import asyncio
+import logging
+import multiprocessing as mp
+import os
+import signal
+import time
+from concurrent import futures
+from typing import AsyncIterator, Dict, Optional, Tuple
+
+import grpc
+from grpc_reflection.v1alpha import reflection
+
+from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
+from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
+from sglang.srt.managers.data_parallel_controller import (
+    run_data_parallel_controller_process,
+)
+from sglang.srt.managers.io_struct import (
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+)
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
+
+def _launch_scheduler_process_only(
+    server_args: ServerArgs,
+    port_args: Optional[PortArgs] = None,
+) -> Tuple[Dict, PortArgs, list]:
+    """
+    Launch only the scheduler process(es) without tokenizer/detokenizer.
+    Returns scheduler info, port args, and list of scheduler processes.
+    """
+    # Configure global environment
+    configure_logger(server_args)
+    server_args.check_server_args()
+
+    # Allocate ports for inter-process communications
+    if port_args is None:
+        port_args = PortArgs.init_new(server_args)
+    logger.info(f"{server_args=}")
+
+    # Prepare model and tokenizer paths
+    server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
+        server_args.model_path, server_args.tokenizer_path
+    )
+
+    scheduler_procs = []
+    if server_args.dp_size == 1:
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+        scheduler_pipe_readers = []
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
+        tp_rank_range = range(
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+        )
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+        )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        port_args,
+                        gpu_id,
+                        tp_rank,
+                        moe_ep_rank,
+                        pp_rank,
+                        None,
+                        writer,
+                        None,
+                    ),
+                )
+
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
+                scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)
+    else:
+        # Launch the data parallel controller
+        reader, writer = mp.Pipe(duplex=False)
+        scheduler_pipe_readers = [reader]
+        proc = mp.Process(
+            target=run_data_parallel_controller_process,
+            args=(server_args, port_args, writer),
+        )
+        proc.start()
+        scheduler_procs.append(proc)
+
+    # TODO(CatherineSue): handle cases for multi-node
+
+    # Wait for all scheduler processes to be ready
+    scheduler_infos = []
+    for i, reader in enumerate(scheduler_pipe_readers):
+        try:
+            data = reader.recv()
+        except EOFError:
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise RuntimeError(f"Failed to initialize scheduler rank {i}")
+
+        if data.get("status") != "ready":
+            raise RuntimeError(
+                f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
+            )
+        scheduler_infos.append(data)
+
+    logger.info(
+        f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
+    )
+
+    # Return the first scheduler's info (they should all be the same)
+    return scheduler_infos[0], port_args, scheduler_procs
+
+
+class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
+    """
+    Standalone gRPC service implementation using GrpcRequestManager.
+    Fully separated from HTTP server with its own process and no shared globals.
+    """
+
+    def __init__(
+        self,
+        request_manager: GrpcRequestManager,
+        server_args: ServerArgs,
+        model_info: Dict,
+    ):
+        """Initialize the standalone gRPC service."""
+        self.request_manager = request_manager
+        self.server_args = server_args
+        self.model_info = model_info
+        self.start_time = time.time()
+
+        # Start the request manager's event loop using auto_create_handle_loop
+        self.request_manager.auto_create_handle_loop()
+
+        logger.info("Standalone gRPC scheduler service initialized")
+
+    async def Generate(
+        self,
+        request: sglang_scheduler_pb2.GenerateRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
+        """Handle generation requests with streaming responses."""
+        logger.info(f"Generation request: {request.request_id}")
+
+        try:
+            # Convert gRPC request to internal format
+            tokenized_req = self._convert_generate_request(request)
+
+            # Submit to request manager
+            output_queue = await self.request_manager.generate_request(
+                obj=tokenized_req,
+                request_id=request.request_id,
+                grpc_context=context,
+            )
+
+            # Stream outputs
+            while True:
+                try:
+                    # Get output with timeout
+                    output = await asyncio.wait_for(output_queue.get(), timeout=4)
+
+                    # Check for errors
+                    if "error" in output:
+                        yield sglang_scheduler_pb2.GenerateResponse(
+                            request_id=request.request_id,
+                            error=sglang_scheduler_pb2.GenerateError(
+                                message=output["error"],
+                                http_status_code=(
+                                    "500" if "abort" not in output else "499"
+                                ),
+                            ),
+                        )
+                        break
+
+                    # Check if finished
+                    if output.get("finished", False):
+                        # Send completion
+                        yield self._create_completion_response(
+                            request.request_id, output
+                        )
+                        break
+                    else:
+                        # Send chunk
+                        yield self._create_chunk_response(request.request_id, output)
+
+                except asyncio.TimeoutError:
+                    # Check if context is still active
+                    if context.cancelled():
+                        # Abort the request
+                        await self.request_manager.abort_request(request.request_id)
+                        break
+                    continue
+
+        except Exception as e:
+            logger.error(f"Generate failed: {e}\n{get_exception_traceback()}")
+            yield sglang_scheduler_pb2.GenerateResponse(
+                request_id=request.request_id,
+                error=sglang_scheduler_pb2.GenerateError(
+                    message=str(e),
+                    http_status_code="500",
+                    details=get_exception_traceback(),
+                ),
+            )
+
+    async def Embed(
+        self,
+        request: sglang_scheduler_pb2.EmbedRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.EmbedResponse:
+        """Handle embedding requests."""
+        logger.info(f"Embedding request: {request.request_id}")
+
+        try:
+            # Convert request
+            tokenized_req = self._convert_embed_request(request)
+
+            # Submit to request manager
+            future = await self.request_manager.embedding_request(
+                obj=tokenized_req,
+                request_id=request.request_id,
+            )
+
+            # Wait for result
+            result = await future
+
+            # Create response
+            return sglang_scheduler_pb2.EmbedResponse(
+                request_id=request.request_id,
+                complete=sglang_scheduler_pb2.EmbedComplete(
+                    embedding=result["embedding"],
+                    prompt_tokens=result.get("prompt_tokens", 0),
+                    cached_tokens=0,
+                    embedding_dim=len(result["embedding"]),
+                    generation_time=time.time() - self.start_time,
+                ),
+            )
+
+        except Exception as e:
+            logger.error(f"Embed failed: {e}\n{get_exception_traceback()}")
+            return sglang_scheduler_pb2.EmbedResponse(
+                request_id=request.request_id,
+                error=sglang_scheduler_pb2.EmbedError(
+                    message=str(e),
+                    code="INTERNAL_ERROR",
+                    details=get_exception_traceback(),
+                ),
+            )
+
+    async def HealthCheck(
+        self,
+        request: sglang_scheduler_pb2.HealthCheckRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.HealthCheckResponse:
+        """Health check by generating from client input."""
+        try:
+            # Check if request manager is shutting down
+            if self.request_manager.gracefully_exit:
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=False, message="Server shutting down"
+                )
+
+            # Extract tokenized input from request
+            if not request.HasField("tokenized"):
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=False, message="Tokenized input required for health check"
+                )
+
+            input_text = request.tokenized.original_text
+            input_ids = list(request.tokenized.input_ids)
+
+            # Create health check request
+            rid = f"HEALTH_CHECK_GRPC_{time.time()}"
+
+            health_request = TokenizedGenerateReqInput(
+                rid=rid,
+                input_text=input_text,
+                input_ids=input_ids,
+                sampling_params=SGLSamplingParams(max_new_tokens=1, temperature=0.0),
+                stream=False,
+                mm_inputs=None,
+                return_logprob=False,
+                logprob_start_len=-1,
+                top_logprobs_num=0,
+                token_ids_logprob=None,
+            )
+
+            logger.info(f"Sending health check request to request manager...")
+
+            # Submit and wait for response
+            output_queue = await self.request_manager.generate_request(
+                health_request, request_id=rid
+            )
+
+            try:
+                # Wait for response with configurable timeout
+                response = await asyncio.wait_for(
+                    output_queue.get(), timeout=HEALTH_CHECK_TIMEOUT
+                )
+
+                # Clean up
+                if rid in self.request_manager.rid_to_state:
+                    del self.request_manager.rid_to_state[rid]
+
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=True, message="Health check passed"
+                )
+
+            except asyncio.TimeoutError:
+                # Clean up on timeout
+                if rid in self.request_manager.rid_to_state:
+                    del self.request_manager.rid_to_state[rid]
+
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=False, message="Health check timeout"
+                )
+
+        except Exception as e:
+            logger.error(f"Health check failed: {e}")
+            return sglang_scheduler_pb2.HealthCheckResponse(
+                healthy=False, message=f"Health check error: {str(e)}"
+            )
+
+    async def Abort(
+        self,
+        request: sglang_scheduler_pb2.AbortRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.AbortResponse:
+        """Abort an ongoing request."""
+        logger.info(f"Aborting request: {request.request_id}")
+
+        try:
+            success = await self.request_manager.abort_request(request.request_id)
+
+            return sglang_scheduler_pb2.AbortResponse(
+                success=success,
+                message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
+            )
+        except Exception as e:
+            logger.error(f"Abort failed: {e}")
+            return sglang_scheduler_pb2.AbortResponse(
+                success=False,
+                message=str(e),
+            )
+
+    # Helper methods for request/response conversion
+
+    def _convert_generate_request(
+        self, grpc_req: sglang_scheduler_pb2.GenerateRequest
+    ) -> TokenizedGenerateReqInput:
+        """Convert gRPC GenerateRequest to internal format."""
+
+        # Extract tokenized input
+        if not grpc_req.HasField("tokenized"):
+            raise ValueError("Tokenized input must be provided")
+
+        input_text = grpc_req.tokenized.original_text
+        input_ids = list(grpc_req.tokenized.input_ids)
+
+        # Convert sampling params
+        sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
+
+        # Create request
+        return TokenizedGenerateReqInput(
+            rid=grpc_req.request_id,
+            input_text=input_text,
+            input_ids=input_ids,
+            mm_inputs=None,  # TODO: implement mm support
+            sampling_params=sampling_params,
+            return_logprob=grpc_req.return_logprob,
+            logprob_start_len=grpc_req.logprob_start_len or -1,
+            top_logprobs_num=grpc_req.top_logprobs_num or 0,
+            stream=True,  # Always stream for gRPC
+            lora_path=grpc_req.lora_id if grpc_req.lora_id else None,
+            token_ids_logprob=(
+                list(grpc_req.token_ids_logprob) if grpc_req.token_ids_logprob else None
+            ),
+        )
+
+    def _convert_embed_request(
+        self, grpc_req: sglang_scheduler_pb2.EmbedRequest
+    ) -> TokenizedEmbeddingReqInput:
+        """Convert gRPC EmbedRequest to internal format."""
+
+        # Extract tokenized input
+        if not grpc_req.HasField("tokenized"):
+            raise ValueError("Tokenized input must be provided")
+
+        input_text = grpc_req.tokenized.original_text
+        input_ids = list(grpc_req.tokenized.input_ids)
+
+        return TokenizedEmbeddingReqInput(
+            rid=grpc_req.request_id,
+            input_text=input_text,
+            input_ids=input_ids,
+        )
+
+    def _convert_sampling_params(
+        self, grpc_params: sglang_scheduler_pb2.SamplingParams
+    ) -> SGLSamplingParams:
+        """Convert gRPC SamplingParams to internal format."""
+
+        # Handle constraint types
+        regex = None
+        json_schema = None
+        ebnf_grammar = None
+
+        if grpc_params.HasField("regex"):
+            regex = grpc_params.regex
+        elif grpc_params.HasField("json_schema"):
+            json_schema = grpc_params.json_schema
+        elif grpc_params.HasField("ebnf_grammar"):
+            ebnf_grammar = grpc_params.ebnf_grammar
+
+        return SGLSamplingParams(
+            temperature=grpc_params.temperature or 1.0,
+            top_p=grpc_params.top_p or 1.0,
+            top_k=grpc_params.top_k or -1,
+            min_p=grpc_params.min_p or 0.0,
+            frequency_penalty=grpc_params.frequency_penalty or 0.0,
+            presence_penalty=grpc_params.presence_penalty or 0.0,
+            repetition_penalty=grpc_params.repetition_penalty or 1.0,
+            max_new_tokens=grpc_params.max_new_tokens or 128,
+            min_new_tokens=grpc_params.min_new_tokens or 0,
+            stop=list(grpc_params.stop) if grpc_params.stop else None,
+            stop_token_ids=(
+                list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else None
+            ),
+            skip_special_tokens=grpc_params.skip_special_tokens,
+            spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
+            regex=regex,
+            json_schema=json_schema,
+            ebnf=ebnf_grammar,
+            n=grpc_params.n or 1,
+            ignore_eos=grpc_params.ignore_eos,
+        )
+
+    def _create_chunk_response(
+        self, request_id: str, output: Dict
+    ) -> sglang_scheduler_pb2.GenerateResponse:
+        """Create a streaming chunk response."""
+        return sglang_scheduler_pb2.GenerateResponse(
+            request_id=request_id,
+            chunk=sglang_scheduler_pb2.GenerateStreamChunk(
+                token_id=output["token_ids"][-1] if output.get("token_ids") else 0,
+                text=output.get("text", ""),
+                prompt_tokens=0,
+                completion_tokens=len(output.get("token_ids", [])),
+                cached_tokens=0,
+                generation_time=time.time() - self.start_time,
+                queue_time=0.0,
+            ),
+        )
+
+    def _create_completion_response(
+        self, request_id: str, output: Dict
+    ) -> sglang_scheduler_pb2.GenerateResponse:
+        """Create a completion response."""
+
+        # Determine finish reason
+        finish_reason = sglang_scheduler_pb2.GenerateComplete.STOP
+        meta_info = output.get("meta_info", {})
+        if meta_info.get("finish_reason") == "length":
+            finish_reason = sglang_scheduler_pb2.GenerateComplete.LENGTH
+        elif meta_info.get("finish_reason") == "eos_token":
+            finish_reason = sglang_scheduler_pb2.GenerateComplete.EOS_TOKEN
+
+        return sglang_scheduler_pb2.GenerateResponse(
+            request_id=request_id,
+            complete=sglang_scheduler_pb2.GenerateComplete(
+                output_ids=output.get("token_ids", []),
+                output_text=output.get("text", ""),
+                finish_reason=finish_reason,
+            ),
+        )
+
+    async def shutdown(self):
+        """Shutdown the service."""
+        logger.info("Shutting down gRPC service")
+
+        # Shutdown request manager (handles its own tasks)
+        await self.request_manager.shutdown()
+
+
+async def serve_grpc(
+    server_args: ServerArgs,
+    model_info: Optional[Dict] = None,
+):
+    """Start the standalone gRPC server with integrated scheduler."""
+
+    # Launch only the scheduler process(es) (no tokenizer/detokenizer needed for gRPC)
+    logger.info("Launching scheduler process(es)...")
+    scheduler_info, port_args, scheduler_procs = _launch_scheduler_process_only(
+        server_args=server_args,
+    )
+
+    # Update model info from scheduler info
+    if model_info is None:
+        model_info = {
+            "model_name": server_args.model_path,
+            "max_context_length": scheduler_info.get(
+                "max_total_num_tokens", server_args.context_length or 8192
+            ),
+            "vocab_size": scheduler_info.get("vocab_size", 128256),
+            "supports_vision": scheduler_info.get("supports_vision", False),
+            "model_type": scheduler_info.get("model_type", "transformer"),
+            "max_req_input_len": scheduler_info.get("max_req_input_len", 8192),
+            "eos_token_ids": scheduler_info.get("eos_token_ids", []),
+            "pad_token_id": scheduler_info.get("pad_token_id", 0),
+            "bos_token_id": scheduler_info.get("bos_token_id", 1),
+        }
+
+    # Create request manager with the correct port args
+    request_manager = GrpcRequestManager(
+        server_args=server_args,
+        port_args=port_args,
+    )
+
+    # Create gRPC server
+    server = grpc.aio.server(
+        futures.ThreadPoolExecutor(max_workers=10),
+        options=[
+            ("grpc.max_send_message_length", 1024 * 1024 * 256),
+            ("grpc.max_receive_message_length", 1024 * 1024 * 256),
+        ],
+    )
+
+    # Add service
+    servicer = SGLangSchedulerServicer(
+        request_manager=request_manager,
+        server_args=server_args,
+        model_info=model_info,
+    )
+    sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
+
+    # Enable reflection
+    SERVICE_NAMES = (
+        sglang_scheduler_pb2.DESCRIPTOR.services_by_name["SglangScheduler"].full_name,
+        reflection.SERVICE_NAME,
+    )
+    reflection.enable_server_reflection(SERVICE_NAMES, server)
+
+    # Start server
+    listen_addr = f"{server_args.host}:{server_args.port}"
+    server.add_insecure_port(listen_addr)
+
+    logger.info(f"Starting standalone gRPC server on {listen_addr}")
+
+    await server.start()
+
+    # Handle shutdown signals
+    loop = asyncio.get_running_loop()
+    stop_event = asyncio.Event()
+
+    def signal_handler():
+        logger.info("Received shutdown signal")
+        stop_event.set()
+
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, signal_handler)
+
+    try:
+        await stop_event.wait()
+    finally:
+        logger.info("Shutting down gRPC server")
+        await servicer.shutdown()
+        await server.stop(5.0)
+
+        # Terminate scheduler processes
+        for i, proc in enumerate(scheduler_procs):
+            if proc and proc.is_alive():
+                logger.info(f"Terminating scheduler process {i}...")
+                proc.terminate()
+                proc.join(timeout=5.0)
+                if proc.is_alive():
+                    logger.warning(f"Force killing scheduler process {i}...")
+                    proc.kill()
+                    proc.join()
+
+
+def main():
+    """Main entry point for standalone gRPC server."""
+    # Fix CUDA multiprocessing issues - must be called before any CUDA operations
+    mp.set_start_method("spawn", force=True)
+
+    parser = argparse.ArgumentParser(description="SGLang Standalone gRPC Server")
+
+    # Server arguments
+    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
+    parser.add_argument("--port", type=int, default=30000, help="gRPC server port")
+
+    # Model arguments
+    parser.add_argument("--model-path", type=str, required=True, help="Model path")
+    parser.add_argument("--tokenizer-path", type=str, help="Tokenizer path")
+    parser.add_argument("--context-length", type=int, help="Context length")
+    parser.add_argument("--tp-size", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument("--dp-size", type=int, default=1, help="Data parallel size")
+
+    # Runtime arguments
+    parser.add_argument(
+        "--max-running-requests", type=int, default=2048, help="Max concurrent requests"
+    )
+    parser.add_argument(
+        "--max-total-tokens", type=int, default=1000000, help="Max total tokens"
+    )
+    parser.add_argument(
+        "--max-prefill-tokens", type=int, default=16384, help="Max prefill tokens"
+    )
+    parser.add_argument(
+        "--attention-backend", type=str, default="flashinfer", help="Attention backend"
+    )
+    parser.add_argument("--lora-paths", type=str, help="LoRA adapter paths")
+
+    # Logging
+    parser.add_argument("--log-level", type=str, default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+
+    # Convert to ServerArgs with gRPC host and port
+    server_args = ServerArgs(
+        model_path=args.model_path,
+        tokenizer_path=args.tokenizer_path or args.model_path,
+        context_length=args.context_length,
+        tp_size=args.tp_size,
+        dp_size=args.dp_size,
+        max_running_requests=args.max_running_requests,
+        max_total_tokens=args.max_total_tokens,
+        max_prefill_tokens=args.max_prefill_tokens,
+        attention_backend=args.attention_backend,
+        lora_paths=args.lora_paths.split(",") if args.lora_paths else None,
+        log_level=args.log_level,
+        # Override with gRPC server host and port
+        host=args.host,
+        port=args.port,
+    )
+
+    # Run server
+    asyncio.run(
+        serve_grpc(
+            server_args=server_args,
+        )
+    )
+
+
+if __name__ == "__main__":
+    main()