sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3rc0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +24 -3
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +85 -54
- sglang/srt/entrypoints/openai/protocol.py +4 -1
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +36 -16
- sglang/srt/entrypoints/openai/serving_completions.py +12 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +6 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +147 -47
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +64 -40
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +158 -160
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +187 -39
- sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +259 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/hicache_storage.py +3 -23
- sglang/srt/mem_cache/hiradix_cache.py +103 -43
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +105 -46
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +493 -76
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +59 -2
- sglang/srt/model_executor/model_runner.py +356 -29
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +109 -15
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +1 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +56 -12
- sglang/srt/models/qwen3_moe.py +1 -1
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +276 -35
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +43 -4
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +34 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +11 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
- sglang/srt/disaggregation/launch_lb.py +0 -118
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/eplb/expert_distribution.py

@@ -11,6 +11,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+
+from __future__ import annotations
+
 import logging
 import math
 import os
@@ -19,16 +22,20 @@ from abc import ABC
 from collections import deque
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type

 import einops
 import torch
 import torch.distributed

-from sglang.srt.eplb.expert_location import ExpertLocationMetadata
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import Withable, get_bool_env_var
+from sglang.srt.utils import Withable, get_bool_env_var, is_npu
+
+_is_npu = is_npu()
+
+if TYPE_CHECKING:
+    from sglang.srt.eplb.expert_location import ExpertLocationMetadata

 logger = logging.getLogger(__name__)

@@ -43,7 +50,7 @@ class ExpertDistributionRecorder(ABC):
     @staticmethod
     def init_new(
         server_args: ServerArgs,
-        expert_location_metadata:
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         if server_args.expert_distribution_recorder_mode is not None:
@@ -118,7 +125,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
     def __init__(
         self,
         server_args: ServerArgs,
-        expert_location_metadata:
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         self._server_args = server_args
@@ -211,7 +218,9 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
     def _on_hook(self, hook_name: str, **kwargs):
         if self._disable_all:
             return
-        if not (
+        if not (
+            self._recording or torch.get_device_module().is_current_stream_capturing()
+        ):
             return
         gatherer = self._single_pass_gatherers[
             self._accumulator.get_single_pass_gatherer_key(
@@ -279,7 +288,7 @@ class _SinglePassGatherer(ABC):
     @staticmethod
     def init_new(
         server_args: ServerArgs,
-        expert_location_metadata:
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ) -> "_SinglePassGatherer":
         if server_args.expert_distribution_recorder_mode == "per_token":
@@ -307,7 +316,7 @@ class _SinglePassGatherer(ABC):

         return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)

-    def __init__(self, expert_location_metadata:
+    def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
         self._expert_location_metadata = expert_location_metadata
         self._rank = rank

@@ -346,7 +355,7 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
     def __init__(
         self,
         server_args: ServerArgs,
-        expert_location_metadata:
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         super().__init__(expert_location_metadata, rank)
@@ -446,6 +455,10 @@ def _list_sum(a: List, b: List) -> List:
 class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
     def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
         super().__init__(*args, **kwargs)
+        if not _is_npu:
+            device = "cuda"
+        else:
+            device = "npu"
         self._enable_global_physical_experts = enable_global_physical_experts
         self._data = torch.zeros(
             (
@@ -457,7 +470,7 @@ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
                 ),
             ),
             dtype=torch.int,
-            device=
+            device=device,
         )

     def reset(self):
@@ -561,7 +574,7 @@ class _Accumulator(ABC):
     @staticmethod
     def init_new(
         server_args: ServerArgs,
-        expert_location_metadata:
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ) -> "_Accumulator":
         return _Accumulator.get_class(server_args)(
@@ -580,7 +593,7 @@ class _Accumulator(ABC):
     def __init__(
         self,
         server_args: ServerArgs,
-        expert_location_metadata:
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         self._server_args = server_args
@@ -779,7 +792,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):

         if self._first_dump:
             self._first_dump = False
-            torch.
+            torch.get_device_module().empty_cache()

         torch.distributed.all_reduce(
             logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
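The new side of these hunks selects the recorder buffer device via `is_npu()` and routes device-wide calls through `torch.get_device_module()` instead of a hard-coded CUDA path. A minimal sketch of that pattern, assuming PyTorch with `torch.get_device_module` available, an installed sglang providing `sglang.srt.utils.is_npu`, and a CUDA or NPU accelerator at runtime (not code from this diff):

```python
# Illustrative sketch only; fails without an accelerator present.
import torch
from sglang.srt.utils import is_npu

device = "npu" if is_npu() else "cuda"

# Allocate recorder buffers on the detected accelerator instead of hard-coding "cuda".
data = torch.zeros((8, 16), dtype=torch.int, device=device)

# Route device-wide calls through the generic module accessor so the same code path
# works for both backends.
torch.get_device_module().empty_cache()
print(torch.get_device_module().is_current_stream_capturing())
```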
sglang/srt/eplb/expert_location.py

@@ -11,21 +11,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+
+from __future__ import annotations
+
 import json
 import logging
 import random
 from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional

 import torch
 import torch.distributed
 import torch.nn.functional as F

-from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.eplb import eplb_algorithms
 from sglang.srt.model_loader import get_model_architecture
-
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.server_args import ServerArgs

 logger = logging.getLogger(__name__)

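The same import restructuring appears here: `from __future__ import annotations` plus a `TYPE_CHECKING` block keeps `ModelConfig` and `ServerArgs` as type-checker-only imports. A generic sketch of the pattern (illustrative, not taken from the file):

```python
from __future__ import annotations  # annotations become strings, evaluated lazily

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only by static type checkers (mypy, pyright); never executed at runtime,
    # which avoids circular imports and trims the runtime import graph.
    from sglang.srt.configs.model_config import ModelConfig


def describe(config: ModelConfig) -> str:
    # At runtime the annotation is just the string "ModelConfig"; no import happens.
    return f"got a {type(config).__name__}"
```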
sglang/srt/eplb/expert_location_updater.py

@@ -47,7 +47,7 @@ class ExpertLocationUpdater:
     ):
         if self._first_execution:
             self._first_execution = False
-            torch.
+            torch.get_device_module().empty_cache()

         old_expert_location_metadata = get_global_expert_location_metadata()
         assert old_expert_location_metadata is not None
sglang/srt/function_call/base_format_detector.py

@@ -162,12 +162,9 @@ class BaseFormatDetector(ABC):

         try:
             try:
-
-
-
-                    self.tool_call_separator + self.bot_token
-                ):
-                    start_idx = len(self.tool_call_separator + self.bot_token)
+                tool_call_pos = current_text.find(self.bot_token)
+                if tool_call_pos != -1:
+                    start_idx = tool_call_pos + len(self.bot_token)
                 elif self.current_tool_id > 0 and current_text.startswith(
                     self.tool_call_separator
                 ):
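The rewritten branch locates the bot token anywhere in the accumulated text with `str.find` instead of requiring an exact prefix match. A toy illustration of the new lookup; the token value here is made up for the example:

```python
# Hypothetical bot token; real detectors define their own model-specific tag.
bot_token = "<tool_call>"
current_text = 'Some assistant prose first. <tool_call>{"name": "get_weather", "arguments": {}}'

tool_call_pos = current_text.find(bot_token)
if tool_call_pos != -1:
    # Parsing starts right after the bot token, regardless of what precedes it.
    start_idx = tool_call_pos + len(bot_token)
    print(current_text[start_idx:])  # -> '{"name": "get_weather", "arguments": {}}'
```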
sglang/srt/function_call/ebnf_composer.py

@@ -50,19 +50,19 @@ class EBNFComposer:

     CALL_RULE_MAP = {
         "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
-        "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ",
+        "json": 'call_{name} ::= "{{" ws "\\"name\\"" ws ":" ws "\\"{name}\\"" ws "," ws "\\"arguments\\"" ws ":" ws {arguments_rule} ws "}}"',
         "xml": 'call_{name} ::= "<function={name}>\\n" {arguments_rule} "\\n</function>"',
     }

     ARGUMENTS_RULE_MAP = {
         "pythonic": "{arg_rules}",
-        "json": '"{{" {arg_rules} "}}"',
+        "json": '"{{" ws {arg_rules} ws "}}"',
         "xml": "{arg_rules}",
     }

     KEY_VALUE_RULE_MAP = {
         "pythonic": '"{key}" "=" {valrule}',
-        "json": '"\\"{key}\\"" ":" {valrule}',
+        "json": '"\\"{key}\\"" ws ":" ws {valrule}',
         "xml": '"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
     }

@@ -165,7 +165,7 @@ class EBNFComposer:
         tool_call_separator: Optional[str] = None,
         call_rule_fmt: Optional[str] = None,
         key_value_rule_fmt: Optional[str] = None,
-        key_value_separator: str = ",",
+        key_value_separator: str = 'ws "," ws',
     ):
         """
         Generalized EBNF builder for all detectors.
@@ -183,6 +183,10 @@
             key_value_rule_fmt: Optional custom format string for key-value pairs. It should define how each parameter is formatted,
                 with placeholders {key} for the parameter name and {valrule} for the value rule. If None, a default format
                 based on function_format will be used.
+            key_value_separator: Raw EBNF fragment inserted between key-value pairs.
+                This string is used verbatim (not auto-quoted). Pass:
+                - Quoted terminals when you need a literal token (e.g. '","' or '"\\n"').
+                - Raw/non-terminals when you need grammar tokens (e.g. 'ws "," ws').
         """
         # =================================================================
         # Step 1: Determine the root tool calls rule
@@ -281,9 +285,7 @@
         # Add required properties joined by commas
         if required:
             rule_parts.append(
-                f
-                    prop_kv_pairs[k] for k in required
-                )
+                f" {key_value_separator} ".join(prop_kv_pairs[k] for k in required)
             )

         # Add optional properties with flexible ordering
@@ -298,14 +300,14 @@
                 opt_parts.append(prop_kv_pairs[optional[j]])
             else:
                 opt_parts.append(
-                    f
+                    f" ( {key_value_separator} {prop_kv_pairs[optional[j]]} )?"
                 )
             opt_alternatives.append("".join(opt_parts))

         # Wrap with appropriate comma handling based on whether we have required properties
         if required:
             # Required properties exist, so optional group needs outer comma
-            rule_parts.append(f
+            rule_parts.append(f" ( {key_value_separator} ( ")
             rule_parts.append(" | ".join(opt_alternatives))
             rule_parts.append(" ) )?")
         else:
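Per the docstring added above, `key_value_separator` is now spliced into the grammar verbatim, so callers choose between a quoted terminal and a fragment containing non-terminals. A rough illustration of the two styles; the rule and symbol names used here, such as `ws` and `basic_string`, are assumptions about the composed grammar, not verified against it:

```python
# Raw fragment: "ws" non-terminals allow optional whitespace around the comma (JSON style).
json_separator = 'ws "," ws'
# Quoted terminal: a literal newline token between XML-style key/value pairs.
xml_separator = '"\\n"'

# Joining two hypothetical key/value rules the way the composer does for required properties:
pairs = ['"\\"city\\"" ws ":" ws basic_string', '"\\"unit\\"" ws ":" ws basic_string']
required_rule = f" {json_separator} ".join(pairs)
print(required_rule)
# "\"city\"" ws ":" ws basic_string ws "," ws "\"unit\"" ws ":" ws basic_string
```

This also appears to be why the detector defaults further below change from "\\n" to '"\\n"': a bare newline would otherwise be spliced in as an unquoted grammar fragment rather than a literal token.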
sglang/srt/function_call/function_call_parser.py

@@ -69,6 +69,8 @@ class FunctionCallParser:
         Returns:
             True if the text contains a tool call, False otherwise
         """
+        if not self.tools:
+            return False
         return self.detector.has_tool_call(text)

     def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
@@ -83,6 +85,8 @@
         - The remaining text after parsing that was not consumed by the detector (can be treated as normal text)
         - A list of tool calls parsed from the text
         """
+        if not self.tools:
+            return full_text, []
         parsed_result = self.detector.detect_and_parse(full_text, self.tools)
         tool_call_list = parsed_result.calls
         if tool_call_list:
@@ -102,6 +106,8 @@
         - The normal text that should be displayed to the user
         - A list of tool calls parsed from the chunk
         """
+        if not self.tools:
+            return chunk_text, []
         final_normal_text = ""
         final_calls = []

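All three entry points now short-circuit when no tools are configured, so text passes through untouched and the detector never runs. A hypothetical usage sketch; the constructor arguments and the parser name shown are assumptions, not taken from this diff:

```python
from sglang.srt.function_call.function_call_parser import FunctionCallParser

# Assumed constructor shape: a tool list plus a named detector.
parser = FunctionCallParser(tools=[], tool_call_parser="qwen25")

# With an empty tool list the text is returned as-is and no tool calls are produced.
text, calls = parser.parse_non_stream("plain reply, no tool call markup here")
assert text == "plain reply, no tool call markup here"
assert calls == []
assert parser.has_tool_call("<tool_call>ignored</tool_call>") is False
```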
sglang/srt/function_call/glm4_moe_detector.py

@@ -160,5 +160,5 @@ class Glm4MoeDetector(BaseFormatDetector):
             function_format="xml",
             call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
             key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
-            key_value_separator="\\n",
+            key_value_separator='"\\n"',
         )
sglang/srt/function_call/qwen3_coder_detector.py

@@ -358,5 +358,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
             function_format="xml",
             call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
             key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
-            key_value_separator="\\n",
+            key_value_separator='"\\n"',
         )
sglang/srt/grpc/__init__.py (new file)

@@ -0,0 +1 @@
# SGLang gRPC module
sglang/srt/grpc/sglang_scheduler_pb2.py (new file)

@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: sglang_scheduler.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    6,
    31,
    1,
    '',
    'sglang_scheduler.proto'
)
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2
from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2

DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1cgoogle/protobuf/struct.proto\"\xc7\x05\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x16\n\x0emax_new_tokens\x18\x08 \x01(\x05\x12\x0c\n\x04stop\x18\t \x03(\t\x12\x16\n\x0estop_token_ids\x18\n \x03(\x05\x12\x1b\n\x13skip_special_tokens\x18\x0b \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\x0c \x01(\x08\x12\x0f\n\x05regex\x18\r \x01(\tH\x00\x12\x15\n\x0bjson_schema\x18\x0e \x01(\tH\x00\x12\x16\n\x0c\x65\x62nf_grammar\x18\x0f \x01(\tH\x00\x12\x11\n\tlora_path\x18\x10 \x01(\t\x12\t\n\x01n\x18\x11 \x01(\x05\x12\x15\n\rtoken_healing\x18\x12 \x01(\x08\x12\x16\n\x0emin_new_tokens\x18\x13 \x01(\x05\x12\x12\n\nignore_eos\x18\x14 \x01(\x08\x12\x14\n\x0cno_stop_trim\x18\x15 \x01(\x08\x12\x17\n\x0fstream_interval\x18\x16 \x01(\x05\x12H\n\nlogit_bias\x18\x17 \x03(\x0b\x32\x34.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry\x12\x16\n\x0estructural_tag\x18\x18 \x01(\t\x12.\n\rcustom_params\x18\x19 \x01(\x0b\x32\x17.google.protobuf.Struct\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraint\"]\n\x13\x44isaggregatedParams\x12\x16\n\x0e\x62ootstrap_host\x18\x01 \x01(\t\x12\x16\n\x0e\x62ootstrap_port\x18\x02 \x01(\x05\x12\x16\n\x0e\x62ootstrap_room\x18\x03 \x01(\x05\"\xe9\x04\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x04 \x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x16\n\x0ereturn_logprob\x18\x05 \x01(\x08\x12\x19\n\x11logprob_start_len\x18\x06 \x01(\x05\x12\x18\n\x10top_logprobs_num\x18\x07 \x01(\x05\x12\x19\n\x11token_ids_logprob\x18\x08 \x03(\x05\x12\x1c\n\x14return_hidden_states\x18\t \x01(\x08\x12H\n\x14\x64isaggregated_params\x18\n \x01(\x0b\x32*.sglang.grpc.scheduler.DisaggregatedParams\x12\x1e\n\x16\x63ustom_logit_processor\x18\x0b \x01(\t\x12-\n\ttimestamp\x18\x0c \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x13\n\x0blog_metrics\x18\r \x01(\x08\x12\x14\n\x0cinput_embeds\x18\x0e \x03(\x02\x12\x0f\n\x07lora_id\x18\x0f \x01(\t\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x10 \x01(\x05\x12\x15\n\rdp_balance_id\x18\x11 \x01(\x05\":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\x05\"\xd3\x01\n\x10MultimodalInputs\x12\x12\n\nimage_urls\x18\x01 \x03(\t\x12\x12\n\nvideo_urls\x18\x02 \x03(\t\x12\x12\n\naudio_urls\x18\x03 \x03(\t\x12\x33\n\x12processed_features\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x12\n\nimage_data\x18\x05 \x03(\x0c\x12\x12\n\nvideo_data\x18\x06 \x03(\x0c\x12\x12\n\naudio_data\x18\x07 \x03(\x0c\x12\x12\n\nmodalities\x18\x08 \x03(\t\"\xe3\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12;\n\x05\x63hunk\x18\x02 \x01(\x0b\x32*.sglang.grpc.scheduler.GenerateStreamChunkH\x00\x12;\n\x08\x63omplete\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.GenerateCompleteH\x00\x12\x35\n\x05\x65rror\x18\x04 
\x01(\x0b\x32$.sglang.grpc.scheduler.GenerateErrorH\x00\x42\n\n\x08response\"\xf5\x01\n\x13GenerateStreamChunk\x12\x10\n\x08token_id\x18\x01 \x01(\x05\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05\x12\x31\n\x08logprobs\x18\x06 \x01(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12\x15\n\rhidden_states\x18\x07 \x03(\x02\x12\x17\n\x0fgeneration_time\x18\x08 \x01(\x02\x12\x12\n\nqueue_time\x18\t \x01(\x05\"\xcd\x02\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\x05\x12\x13\n\x0boutput_text\x18\x02 \x01(\t\x12K\n\rfinish_reason\x18\x03 \x01(\x0e\x32\x34.sglang.grpc.scheduler.GenerateComplete.FinishReason\x12\x35\n\x0c\x61ll_logprobs\x18\x0b \x03(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12>\n\x11\x61ll_hidden_states\x18\x0c \x03(\x0b\x32#.sglang.grpc.scheduler.HiddenStates\"L\n\x0c\x46inishReason\x12\x08\n\x04STOP\x10\x00\x12\n\n\x06LENGTH\x10\x01\x12\r\n\tEOS_TOKEN\x10\x02\x12\x0c\n\x08STOP_STR\x10\x03\x12\t\n\x05\x41\x42ORT\x10\x04\"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"\x84\x01\n\x08LogProbs\x12\x16\n\x0etoken_logprobs\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x38\n\x0ctop_logprobs\x18\x03 \x03(\x0b\x32\".sglang.grpc.scheduler.TopLogProbs\x12\x13\n\x0btoken_texts\x18\x04 \x03(\t\"E\n\x0bTopLogProbs\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x13\n\x0btoken_texts\x18\x03 \x03(\t\"?\n\x0cHiddenStates\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05layer\x18\x02 \x01(\x05\x12\x10\n\x08position\x18\x03 \x01(\x05\"\xca\x02\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x04 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x05 \x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x13\n\x0blog_metrics\x18\x06 \x01(\x08\x12\x16\n\x0etoken_type_ids\x18\x07 \x03(\x05\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x08 \x01(\x05\x12\x18\n\x10is_cross_encoder\x18\t \x01(\x08\x12\r\n\x05texts\x18\n \x03(\t\"\x9d\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\x08\x63omplete\x18\x02 \x01(\x0b\x32$.sglang.grpc.scheduler.EmbedCompleteH\x00\x12\x32\n\x05\x65rror\x18\x03 \x01(\x0b\x32!.sglang.grpc.scheduler.EmbedErrorH\x00\x42\n\n\x08response\"\xbc\x01\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rcached_tokens\x18\x03 \x01(\x05\x12\x15\n\rembedding_dim\x18\x04 \x01(\x05\x12\x17\n\x0fgeneration_time\x18\x05 \x01(\x02\x12:\n\x10\x62\x61tch_embeddings\x18\x06 \x03(\x0b\x32 .sglang.grpc.scheduler.Embedding\"*\n\tEmbedding\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05index\x18\x02 \x01(\x05\"<\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"N\n\x12HealthCheckRequest\x12\x38\n\ttokenized\x18\x01 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t\"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"I\n\x0fLoadLoRARequest\x12\x12\n\nadapter_id\x18\x01 
\x01(\t\x12\x14\n\x0c\x61\x64\x61pter_path\x18\x02 \x01(\t\x12\x0c\n\x04rank\x18\x03 \x01(\x05\"H\n\x10LoadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nadapter_id\x18\x02 \x01(\t\x12\x0f\n\x07message\x18\x03 \x01(\t\"\'\n\x11UnloadLoRARequest\x12\x12\n\nadapter_id\x18\x01 \x01(\t\"6\n\x12UnloadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"w\n\x14UpdateWeightsRequest\x12\x13\n\tdisk_path\x18\x01 \x01(\tH\x00\x12\x15\n\x0btensor_data\x18\x02 \x01(\x0cH\x00\x12\x14\n\nremote_url\x18\x03 \x01(\tH\x00\x12\x13\n\x0bweight_name\x18\x04 \x01(\tB\x08\n\x06source\"9\n\x15UpdateWeightsResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"-\n\x17GetInternalStateRequest\x12\x12\n\nstate_keys\x18\x01 \x03(\t\"B\n\x18GetInternalStateResponse\x12&\n\x05state\x18\x01 \x01(\x0b\x32\x17.google.protobuf.Struct\"A\n\x17SetInternalStateRequest\x12&\n\x05state\x18\x01 \x01(\x0b\x32\x17.google.protobuf.Struct\"<\n\x18SetInternalStateResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t2\xfe\x02\n\x0fSglangScheduler\x12]\n\x08Generate\x12&.sglang.grpc.scheduler.GenerateRequest\x1a\'.sglang.grpc.scheduler.GenerateResponse0\x01\x12R\n\x05\x45mbed\x12#.sglang.grpc.scheduler.EmbedRequest\x1a$.sglang.grpc.scheduler.EmbedResponse\x12\x64\n\x0bHealthCheck\x12).sglang.grpc.scheduler.HealthCheckRequest\x1a*.sglang.grpc.scheduler.HealthCheckResponse\x12R\n\x05\x41\x62ort\x12#.sglang.grpc.scheduler.AbortRequest\x1a$.sglang.grpc.scheduler.AbortResponseb\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sglang_scheduler_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  DESCRIPTOR._loaded_options = None
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._loaded_options = None
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_options = b'8\001'
  _globals['_SAMPLINGPARAMS']._serialized_start=113
  _globals['_SAMPLINGPARAMS']._serialized_end=824
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_start=762
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_end=810
  _globals['_DISAGGREGATEDPARAMS']._serialized_start=826
  _globals['_DISAGGREGATEDPARAMS']._serialized_end=919
  _globals['_GENERATEREQUEST']._serialized_start=922
  _globals['_GENERATEREQUEST']._serialized_end=1539
  _globals['_TOKENIZEDINPUT']._serialized_start=1541
  _globals['_TOKENIZEDINPUT']._serialized_end=1599
  _globals['_MULTIMODALINPUTS']._serialized_start=1602
  _globals['_MULTIMODALINPUTS']._serialized_end=1813
  _globals['_GENERATERESPONSE']._serialized_start=1816
  _globals['_GENERATERESPONSE']._serialized_end=2043
  _globals['_GENERATESTREAMCHUNK']._serialized_start=2046
  _globals['_GENERATESTREAMCHUNK']._serialized_end=2291
  _globals['_GENERATECOMPLETE']._serialized_start=2294
  _globals['_GENERATECOMPLETE']._serialized_end=2627
  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_start=2551
  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_end=2627
  _globals['_GENERATEERROR']._serialized_start=2629
  _globals['_GENERATEERROR']._serialized_end=2704
  _globals['_LOGPROBS']._serialized_start=2707
  _globals['_LOGPROBS']._serialized_end=2839
  _globals['_TOPLOGPROBS']._serialized_start=2841
  _globals['_TOPLOGPROBS']._serialized_end=2910
  _globals['_HIDDENSTATES']._serialized_start=2912
  _globals['_HIDDENSTATES']._serialized_end=2975
  _globals['_EMBEDREQUEST']._serialized_start=2978
  _globals['_EMBEDREQUEST']._serialized_end=3308
  _globals['_EMBEDRESPONSE']._serialized_start=3311
  _globals['_EMBEDRESPONSE']._serialized_end=3468
  _globals['_EMBEDCOMPLETE']._serialized_start=3471
  _globals['_EMBEDCOMPLETE']._serialized_end=3659
  _globals['_EMBEDDING']._serialized_start=3661
  _globals['_EMBEDDING']._serialized_end=3703
  _globals['_EMBEDERROR']._serialized_start=3705
  _globals['_EMBEDERROR']._serialized_end=3765
  _globals['_HEALTHCHECKREQUEST']._serialized_start=3767
  _globals['_HEALTHCHECKREQUEST']._serialized_end=3845
  _globals['_HEALTHCHECKRESPONSE']._serialized_start=3847
  _globals['_HEALTHCHECKRESPONSE']._serialized_end=3902
  _globals['_ABORTREQUEST']._serialized_start=3904
  _globals['_ABORTREQUEST']._serialized_end=3954
  _globals['_ABORTRESPONSE']._serialized_start=3956
  _globals['_ABORTRESPONSE']._serialized_end=4005
  _globals['_LOADLORAREQUEST']._serialized_start=4007
  _globals['_LOADLORAREQUEST']._serialized_end=4080
  _globals['_LOADLORARESPONSE']._serialized_start=4082
  _globals['_LOADLORARESPONSE']._serialized_end=4154
  _globals['_UNLOADLORAREQUEST']._serialized_start=4156
  _globals['_UNLOADLORAREQUEST']._serialized_end=4195
  _globals['_UNLOADLORARESPONSE']._serialized_start=4197
  _globals['_UNLOADLORARESPONSE']._serialized_end=4251
  _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4253
  _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4372
  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4374
  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4431
  _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4433
  _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4478
  _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4480
  _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4546
  _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4548
  _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4613
  _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4615
  _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4675
  _globals['_SGLANGSCHEDULER']._serialized_start=4678
  _globals['_SGLANGSCHEDULER']._serialized_end=5060
# @@protoc_insertion_point(module_scope)