sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,9 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  # ==============================================================================
+
+ from __future__ import annotations
+
  import logging
  import math
  import os
@@ -19,16 +22,20 @@ from abc import ABC
  from collections import deque
  from contextlib import contextmanager
  from pathlib import Path
- from typing import Any, Dict, List, Literal, Optional, Tuple, Type
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type

  import einops
  import torch
  import torch.distributed

- from sglang.srt.eplb.expert_location import ExpertLocationMetadata
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.utils import Withable, get_bool_env_var
+ from sglang.srt.utils import Withable, get_bool_env_var, is_npu
+
+ _is_npu = is_npu()
+
+ if TYPE_CHECKING:
+     from sglang.srt.eplb.expert_location import ExpertLocationMetadata

  logger = logging.getLogger(__name__)

@@ -43,7 +50,7 @@ class ExpertDistributionRecorder(ABC):
      @staticmethod
      def init_new(
          server_args: ServerArgs,
-         expert_location_metadata: "ExpertLocationMetadata",
+         expert_location_metadata: ExpertLocationMetadata,
          rank: int,
      ):
          if server_args.expert_distribution_recorder_mode is not None:
@@ -118,7 +125,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
      def __init__(
          self,
          server_args: ServerArgs,
-         expert_location_metadata: "ExpertLocationMetadata",
+         expert_location_metadata: ExpertLocationMetadata,
          rank: int,
      ):
          self._server_args = server_args
@@ -211,7 +218,9 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
      def _on_hook(self, hook_name: str, **kwargs):
          if self._disable_all:
              return
-         if not (self._recording or torch.cuda.is_current_stream_capturing()):
+         if not (
+             self._recording or torch.get_device_module().is_current_stream_capturing()
+         ):
              return
          gatherer = self._single_pass_gatherers[
              self._accumulator.get_single_pass_gatherer_key(
@@ -279,7 +288,7 @@ class _SinglePassGatherer(ABC):
      @staticmethod
      def init_new(
          server_args: ServerArgs,
-         expert_location_metadata: "ExpertLocationMetadata",
+         expert_location_metadata: ExpertLocationMetadata,
          rank: int,
      ) -> "_SinglePassGatherer":
          if server_args.expert_distribution_recorder_mode == "per_token":
@@ -307,7 +316,7 @@ class _SinglePassGatherer(ABC):

          return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)

-     def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
+     def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
          self._expert_location_metadata = expert_location_metadata
          self._rank = rank

@@ -346,7 +355,7 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
      def __init__(
          self,
          server_args: ServerArgs,
-         expert_location_metadata: "ExpertLocationMetadata",
+         expert_location_metadata: ExpertLocationMetadata,
          rank: int,
      ):
          super().__init__(expert_location_metadata, rank)
@@ -446,6 +455,10 @@ def _list_sum(a: List, b: List) -> List:
  class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
      def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
          super().__init__(*args, **kwargs)
+         if not _is_npu:
+             device = "cuda"
+         else:
+             device = "npu"
          self._enable_global_physical_experts = enable_global_physical_experts
          self._data = torch.zeros(
              (
@@ -457,7 +470,7 @@ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
                  ),
              ),
              dtype=torch.int,
-             device="cuda",
+             device=device,
          )

      def reset(self):
@@ -561,7 +574,7 @@ class _Accumulator(ABC):
      @staticmethod
      def init_new(
          server_args: ServerArgs,
-         expert_location_metadata: "ExpertLocationMetadata",
+         expert_location_metadata: ExpertLocationMetadata,
          rank: int,
      ) -> "_Accumulator":
          return _Accumulator.get_class(server_args)(
@@ -580,7 +593,7 @@ class _Accumulator(ABC):
      def __init__(
          self,
          server_args: ServerArgs,
-         expert_location_metadata: "ExpertLocationMetadata",
+         expert_location_metadata: ExpertLocationMetadata,
          rank: int,
      ):
          self._server_args = server_args
@@ -779,7 +792,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):

          if self._first_dump:
              self._first_dump = False
-             torch.cuda.empty_cache()
+             torch.get_device_module().empty_cache()

          torch.distributed.all_reduce(
              logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
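
Note: the eplb import hunks above and below move ExpertLocationMetadata, ModelConfig, and ServerArgs behind TYPE_CHECKING, with `from __future__ import annotations` keeping the annotations as plain strings at runtime. A minimal, self-contained sketch of that idiom (the init_new signature is illustrative, not the real one):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers; no runtime import cost or import cycle.
    from sglang.srt.server_args import ServerArgs


def init_new(server_args: ServerArgs) -> None:
    # At runtime the annotation above is just the string "ServerArgs".
    print(type(server_args).__name__)
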
@@ -11,21 +11,26 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  # ==============================================================================
+
+ from __future__ import annotations
+
  import json
  import logging
  import random
  from dataclasses import dataclass
  from pathlib import Path
- from typing import List, Optional
+ from typing import TYPE_CHECKING, List, Optional

  import torch
  import torch.distributed
  import torch.nn.functional as F

- from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.eplb import eplb_algorithms
  from sglang.srt.model_loader import get_model_architecture
- from sglang.srt.server_args import ServerArgs
+
+ if TYPE_CHECKING:
+     from sglang.srt.configs.model_config import ModelConfig
+     from sglang.srt.server_args import ServerArgs

  logger = logging.getLogger(__name__)

@@ -47,7 +47,7 @@ class ExpertLocationUpdater:
      ):
          if self._first_execution:
              self._first_execution = False
-             torch.cuda.empty_cache()
+             torch.get_device_module().empty_cache()

          old_expert_location_metadata = get_global_expert_location_metadata()
          assert old_expert_location_metadata is not None
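
Note: several hunks above swap hard-coded torch.cuda calls for torch.get_device_module() and pick the tensor device from is_npu(). A rough sketch of the pattern, assuming a recent PyTorch build with a CUDA or NPU accelerator present (is_npu below is a local stand-in for sglang.srt.utils.is_npu, not the real helper):

import torch


def is_npu() -> bool:
    # Stand-in: the real check lives in sglang.srt.utils and relies on torch_npu.
    return hasattr(torch, "npu") and torch.npu.is_available()


device = "npu" if is_npu() else "cuda"
data = torch.zeros((8, 8), dtype=torch.int, device=device)

# torch.get_device_module() resolves to torch.cuda or torch.npu, so cache
# cleanup and stream-capture checks stay backend-agnostic.
torch.get_device_module().empty_cache()
capturing = torch.get_device_module().is_current_stream_capturing()
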
@@ -162,12 +162,9 @@ class BaseFormatDetector(ABC):

          try:
              try:
-                 if current_text.startswith(self.bot_token):
-                     start_idx = len(self.bot_token)
-                 elif self.current_tool_id > 0 and current_text.startswith(
-                     self.tool_call_separator + self.bot_token
-                 ):
-                     start_idx = len(self.tool_call_separator + self.bot_token)
+                 tool_call_pos = current_text.find(self.bot_token)
+                 if tool_call_pos != -1:
+                     start_idx = tool_call_pos + len(self.bot_token)
                  elif self.current_tool_id > 0 and current_text.startswith(
                      self.tool_call_separator
                  ):
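
Note: the hunk above relaxes streaming tool-call detection from startswith to find, so plain text the model emits before its first bot token no longer hides the tool call. Illustrative only; the token and payload are made up:

bot_token = "<tool_call>"
current_text = 'Sure, let me check. <tool_call>{"name": "get_weather"}'

# Old check: fails because of the leading prose.
assert not current_text.startswith(bot_token)

# New check: locate the token anywhere in the buffered text.
tool_call_pos = current_text.find(bot_token)
if tool_call_pos != -1:
    start_idx = tool_call_pos + len(bot_token)
    print(current_text[start_idx:])  # -> {"name": "get_weather"}
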
@@ -50,19 +50,19 @@ class EBNFComposer:

      CALL_RULE_MAP = {
          "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
-         "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
+         "json": 'call_{name} ::= "{{" ws "\\"name\\"" ws ":" ws "\\"{name}\\"" ws "," ws "\\"arguments\\"" ws ":" ws {arguments_rule} ws "}}"',
          "xml": 'call_{name} ::= "<function={name}>\\n" {arguments_rule} "\\n</function>"',
      }

      ARGUMENTS_RULE_MAP = {
          "pythonic": "{arg_rules}",
-         "json": '"{{" {arg_rules} "}}"',
+         "json": '"{{" ws {arg_rules} ws "}}"',
          "xml": "{arg_rules}",
      }

      KEY_VALUE_RULE_MAP = {
          "pythonic": '"{key}" "=" {valrule}',
-         "json": '"\\"{key}\\"" ":" {valrule}',
+         "json": '"\\"{key}\\"" ws ":" ws {valrule}',
          "xml": '"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
      }

@@ -165,7 +165,7 @@ class EBNFComposer:
          tool_call_separator: Optional[str] = None,
          call_rule_fmt: Optional[str] = None,
          key_value_rule_fmt: Optional[str] = None,
-         key_value_separator: str = ",",
+         key_value_separator: str = 'ws "," ws',
      ):
          """
          Generalized EBNF builder for all detectors.
@@ -183,6 +183,10 @@
              key_value_rule_fmt: Optional custom format string for key-value pairs. It should define how each parameter is formatted,
                  with placeholders {key} for the parameter name and {valrule} for the value rule. If None, a default format
                  based on function_format will be used.
+             key_value_separator: Raw EBNF fragment inserted between key-value pairs.
+                 This string is used verbatim (not auto-quoted). Pass:
+                 - Quoted terminals when you need a literal token (e.g. '","' or '"\\n"').
+                 - Raw/non-terminals when you need grammar tokens (e.g. 'ws "," ws').
          """
          # =================================================================
          # Step 1: Determine the root tool calls rule
@@ -281,9 +285,7 @@ class EBNFComposer:
          # Add required properties joined by commas
          if required:
              rule_parts.append(
-                 f' "{key_value_separator}" '.join(
-                     prop_kv_pairs[k] for k in required
-                 )
+                 f" {key_value_separator} ".join(prop_kv_pairs[k] for k in required)
              )

          # Add optional properties with flexible ordering
@@ -298,14 +300,14 @@ class EBNFComposer:
                      opt_parts.append(prop_kv_pairs[optional[j]])
                  else:
                      opt_parts.append(
-                         f' ( "{key_value_separator}" {prop_kv_pairs[optional[j]]} )?'
+                         f" ( {key_value_separator} {prop_kv_pairs[optional[j]]} )?"
                      )
              opt_alternatives.append("".join(opt_parts))

          # Wrap with appropriate comma handling based on whether we have required properties
          if required:
              # Required properties exist, so optional group needs outer comma
-             rule_parts.append(f' ( "{key_value_separator}" ( ')
+             rule_parts.append(f" ( {key_value_separator} ( ")
              rule_parts.append(" | ".join(opt_alternatives))
              rule_parts.append(" ) )?")
          else:
@@ -69,6 +69,8 @@ class FunctionCallParser:
          Returns:
              True if the text contains a tool call, False otherwise
          """
+         if not self.tools:
+             return False
          return self.detector.has_tool_call(text)

      def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
@@ -83,6 +85,8 @@
              - The remaining text after parsing that was not consumed by the detector (can be treated as normal text)
              - A list of tool calls parsed from the text
          """
+         if not self.tools:
+             return full_text, []
          parsed_result = self.detector.detect_and_parse(full_text, self.tools)
          tool_call_list = parsed_result.calls
          if tool_call_list:
@@ -102,6 +106,8 @@
              - The normal text that should be displayed to the user
              - A list of tool calls parsed from the chunk
          """
+         if not self.tools:
+             return chunk_text, []
          final_normal_text = ""
          final_calls = []

@@ -160,5 +160,5 @@ class Glm4MoeDetector(BaseFormatDetector):
              function_format="xml",
              call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
              key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
-             key_value_separator="\\n",
+             key_value_separator='"\\n"',
          )
@@ -358,5 +358,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
              function_format="xml",
              call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
              key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
-             key_value_separator="\\n",
+             key_value_separator='"\\n"',
          )
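
Note on the EBNF changes above: build_ebnf no longer wraps key_value_separator in quotes (the join now uses f" {key_value_separator} " instead of f' "{key_value_separator}" '), so the JSON default becomes the raw fragment 'ws "," ws' while XML-style detectors such as Glm4MoeDetector and Qwen3CoderDetector now pass the quoted terminal '"\\n"' themselves. A small, self-contained illustration with made-up property rules:

prop_kv_pairs = {
    "city": '"\\"city\\"" ws ":" ws basic_string',
    "unit": '"\\"unit\\"" ws ":" ws basic_string',
}
required = ["city", "unit"]

old_sep = "\\n"          # used to be auto-wrapped into a quoted terminal
new_sep = 'ws "," ws'    # now spliced in verbatim as grammar tokens
literal_sep = '"\\n"'    # detectors wanting a literal token quote it themselves

print(f' "{old_sep}" '.join(prop_kv_pairs[k] for k in required))    # old composer behaviour
print(f" {new_sep} ".join(prop_kv_pairs[k] for k in required))      # new: raw fragment
print(f" {literal_sep} ".join(prop_kv_pairs[k] for k in required))  # new: quoted terminal
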
@@ -0,0 +1 @@
+ # SGLang gRPC module
@@ -0,0 +1,106 @@
+ # -*- coding: utf-8 -*-
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
+ # NO CHECKED-IN PROTOBUF GENCODE
+ # source: sglang_scheduler.proto
+ # Protobuf Python Version: 6.31.1
+ """Generated protocol buffer code."""
+ from google.protobuf import descriptor as _descriptor
+ from google.protobuf import descriptor_pool as _descriptor_pool
+ from google.protobuf import runtime_version as _runtime_version
+ from google.protobuf import symbol_database as _symbol_database
+ from google.protobuf.internal import builder as _builder
+ _runtime_version.ValidateProtobufRuntimeVersion(
+     _runtime_version.Domain.PUBLIC,
+     6,
+     31,
+     1,
+     '',
+     'sglang_scheduler.proto'
+ )
+ # @@protoc_insertion_point(imports)
+
+ _sym_db = _symbol_database.Default()
+
+
+ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2
+ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
+
+
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1cgoogle/protobuf/struct.proto\"\xc7\x05\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x16\n\x0emax_new_tokens\x18\x08 \x01(\x05\x12\x0c\n\x04stop\x18\t \x03(\t\x12\x16\n\x0estop_token_ids\x18\n \x03(\x05\x12\x1b\n\x13skip_special_tokens\x18\x0b \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\x0c \x01(\x08\x12\x0f\n\x05regex\x18\r \x01(\tH\x00\x12\x15\n\x0bjson_schema\x18\x0e \x01(\tH\x00\x12\x16\n\x0c\x65\x62nf_grammar\x18\x0f \x01(\tH\x00\x12\x11\n\tlora_path\x18\x10 \x01(\t\x12\t\n\x01n\x18\x11 \x01(\x05\x12\x15\n\rtoken_healing\x18\x12 \x01(\x08\x12\x16\n\x0emin_new_tokens\x18\x13 \x01(\x05\x12\x12\n\nignore_eos\x18\x14 \x01(\x08\x12\x14\n\x0cno_stop_trim\x18\x15 \x01(\x08\x12\x17\n\x0fstream_interval\x18\x16 \x01(\x05\x12H\n\nlogit_bias\x18\x17 \x03(\x0b\x32\x34.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry\x12\x16\n\x0estructural_tag\x18\x18 \x01(\t\x12.\n\rcustom_params\x18\x19 \x01(\x0b\x32\x17.google.protobuf.Struct\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraint\"]\n\x13\x44isaggregatedParams\x12\x16\n\x0e\x62ootstrap_host\x18\x01 \x01(\t\x12\x16\n\x0e\x62ootstrap_port\x18\x02 \x01(\x05\x12\x16\n\x0e\x62ootstrap_room\x18\x03 \x01(\x05\"\xe9\x04\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x04 \x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x16\n\x0ereturn_logprob\x18\x05 \x01(\x08\x12\x19\n\x11logprob_start_len\x18\x06 \x01(\x05\x12\x18\n\x10top_logprobs_num\x18\x07 \x01(\x05\x12\x19\n\x11token_ids_logprob\x18\x08 \x03(\x05\x12\x1c\n\x14return_hidden_states\x18\t \x01(\x08\x12H\n\x14\x64isaggregated_params\x18\n \x01(\x0b\x32*.sglang.grpc.scheduler.DisaggregatedParams\x12\x1e\n\x16\x63ustom_logit_processor\x18\x0b \x01(\t\x12-\n\ttimestamp\x18\x0c \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x13\n\x0blog_metrics\x18\r \x01(\x08\x12\x14\n\x0cinput_embeds\x18\x0e \x03(\x02\x12\x0f\n\x07lora_id\x18\x0f \x01(\t\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x10 \x01(\x05\x12\x15\n\rdp_balance_id\x18\x11 \x01(\x05\":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\x05\"\xd3\x01\n\x10MultimodalInputs\x12\x12\n\nimage_urls\x18\x01 \x03(\t\x12\x12\n\nvideo_urls\x18\x02 \x03(\t\x12\x12\n\naudio_urls\x18\x03 \x03(\t\x12\x33\n\x12processed_features\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x12\n\nimage_data\x18\x05 \x03(\x0c\x12\x12\n\nvideo_data\x18\x06 \x03(\x0c\x12\x12\n\naudio_data\x18\x07 \x03(\x0c\x12\x12\n\nmodalities\x18\x08 \x03(\t\"\xe3\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12;\n\x05\x63hunk\x18\x02 \x01(\x0b\x32*.sglang.grpc.scheduler.GenerateStreamChunkH\x00\x12;\n\x08\x63omplete\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.GenerateCompleteH\x00\x12\x35\n\x05\x65rror\x18\x04 
\x01(\x0b\x32$.sglang.grpc.scheduler.GenerateErrorH\x00\x42\n\n\x08response\"\xf5\x01\n\x13GenerateStreamChunk\x12\x10\n\x08token_id\x18\x01 \x01(\x05\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05\x12\x31\n\x08logprobs\x18\x06 \x01(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12\x15\n\rhidden_states\x18\x07 \x03(\x02\x12\x17\n\x0fgeneration_time\x18\x08 \x01(\x02\x12\x12\n\nqueue_time\x18\t \x01(\x05\"\xcd\x02\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\x05\x12\x13\n\x0boutput_text\x18\x02 \x01(\t\x12K\n\rfinish_reason\x18\x03 \x01(\x0e\x32\x34.sglang.grpc.scheduler.GenerateComplete.FinishReason\x12\x35\n\x0c\x61ll_logprobs\x18\x0b \x03(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12>\n\x11\x61ll_hidden_states\x18\x0c \x03(\x0b\x32#.sglang.grpc.scheduler.HiddenStates\"L\n\x0c\x46inishReason\x12\x08\n\x04STOP\x10\x00\x12\n\n\x06LENGTH\x10\x01\x12\r\n\tEOS_TOKEN\x10\x02\x12\x0c\n\x08STOP_STR\x10\x03\x12\t\n\x05\x41\x42ORT\x10\x04\"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"\x84\x01\n\x08LogProbs\x12\x16\n\x0etoken_logprobs\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x38\n\x0ctop_logprobs\x18\x03 \x03(\x0b\x32\".sglang.grpc.scheduler.TopLogProbs\x12\x13\n\x0btoken_texts\x18\x04 \x03(\t\"E\n\x0bTopLogProbs\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x13\n\x0btoken_texts\x18\x03 \x03(\t\"?\n\x0cHiddenStates\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05layer\x18\x02 \x01(\x05\x12\x10\n\x08position\x18\x03 \x01(\x05\"\xca\x02\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x04 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x05 \x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x13\n\x0blog_metrics\x18\x06 \x01(\x08\x12\x16\n\x0etoken_type_ids\x18\x07 \x03(\x05\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x08 \x01(\x05\x12\x18\n\x10is_cross_encoder\x18\t \x01(\x08\x12\r\n\x05texts\x18\n \x03(\t\"\x9d\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\x08\x63omplete\x18\x02 \x01(\x0b\x32$.sglang.grpc.scheduler.EmbedCompleteH\x00\x12\x32\n\x05\x65rror\x18\x03 \x01(\x0b\x32!.sglang.grpc.scheduler.EmbedErrorH\x00\x42\n\n\x08response\"\xbc\x01\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rcached_tokens\x18\x03 \x01(\x05\x12\x15\n\rembedding_dim\x18\x04 \x01(\x05\x12\x17\n\x0fgeneration_time\x18\x05 \x01(\x02\x12:\n\x10\x62\x61tch_embeddings\x18\x06 \x03(\x0b\x32 .sglang.grpc.scheduler.Embedding\"*\n\tEmbedding\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05index\x18\x02 \x01(\x05\"<\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"N\n\x12HealthCheckRequest\x12\x38\n\ttokenized\x18\x01 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t\"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"I\n\x0fLoadLoRARequest\x12\x12\n\nadapter_id\x18\x01 
\x01(\t\x12\x14\n\x0c\x61\x64\x61pter_path\x18\x02 \x01(\t\x12\x0c\n\x04rank\x18\x03 \x01(\x05\"H\n\x10LoadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nadapter_id\x18\x02 \x01(\t\x12\x0f\n\x07message\x18\x03 \x01(\t\"\'\n\x11UnloadLoRARequest\x12\x12\n\nadapter_id\x18\x01 \x01(\t\"6\n\x12UnloadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"w\n\x14UpdateWeightsRequest\x12\x13\n\tdisk_path\x18\x01 \x01(\tH\x00\x12\x15\n\x0btensor_data\x18\x02 \x01(\x0cH\x00\x12\x14\n\nremote_url\x18\x03 \x01(\tH\x00\x12\x13\n\x0bweight_name\x18\x04 \x01(\tB\x08\n\x06source\"9\n\x15UpdateWeightsResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"-\n\x17GetInternalStateRequest\x12\x12\n\nstate_keys\x18\x01 \x03(\t\"B\n\x18GetInternalStateResponse\x12&\n\x05state\x18\x01 \x01(\x0b\x32\x17.google.protobuf.Struct\"A\n\x17SetInternalStateRequest\x12&\n\x05state\x18\x01 \x01(\x0b\x32\x17.google.protobuf.Struct\"<\n\x18SetInternalStateResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t2\xfe\x02\n\x0fSglangScheduler\x12]\n\x08Generate\x12&.sglang.grpc.scheduler.GenerateRequest\x1a\'.sglang.grpc.scheduler.GenerateResponse0\x01\x12R\n\x05\x45mbed\x12#.sglang.grpc.scheduler.EmbedRequest\x1a$.sglang.grpc.scheduler.EmbedResponse\x12\x64\n\x0bHealthCheck\x12).sglang.grpc.scheduler.HealthCheckRequest\x1a*.sglang.grpc.scheduler.HealthCheckResponse\x12R\n\x05\x41\x62ort\x12#.sglang.grpc.scheduler.AbortRequest\x1a$.sglang.grpc.scheduler.AbortResponseb\x06proto3')
+
+ _globals = globals()
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sglang_scheduler_pb2', _globals)
+ if not _descriptor._USE_C_DESCRIPTORS:
+   DESCRIPTOR._loaded_options = None
+   _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._loaded_options = None
+   _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_options = b'8\001'
+   _globals['_SAMPLINGPARAMS']._serialized_start=113
+   _globals['_SAMPLINGPARAMS']._serialized_end=824
+   _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_start=762
+   _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_end=810
+   _globals['_DISAGGREGATEDPARAMS']._serialized_start=826
+   _globals['_DISAGGREGATEDPARAMS']._serialized_end=919
+   _globals['_GENERATEREQUEST']._serialized_start=922
+   _globals['_GENERATEREQUEST']._serialized_end=1539
+   _globals['_TOKENIZEDINPUT']._serialized_start=1541
+   _globals['_TOKENIZEDINPUT']._serialized_end=1599
+   _globals['_MULTIMODALINPUTS']._serialized_start=1602
+   _globals['_MULTIMODALINPUTS']._serialized_end=1813
+   _globals['_GENERATERESPONSE']._serialized_start=1816
+   _globals['_GENERATERESPONSE']._serialized_end=2043
+   _globals['_GENERATESTREAMCHUNK']._serialized_start=2046
+   _globals['_GENERATESTREAMCHUNK']._serialized_end=2291
+   _globals['_GENERATECOMPLETE']._serialized_start=2294
+   _globals['_GENERATECOMPLETE']._serialized_end=2627
+   _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_start=2551
+   _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_end=2627
+   _globals['_GENERATEERROR']._serialized_start=2629
+   _globals['_GENERATEERROR']._serialized_end=2704
+   _globals['_LOGPROBS']._serialized_start=2707
+   _globals['_LOGPROBS']._serialized_end=2839
+   _globals['_TOPLOGPROBS']._serialized_start=2841
+   _globals['_TOPLOGPROBS']._serialized_end=2910
+   _globals['_HIDDENSTATES']._serialized_start=2912
+   _globals['_HIDDENSTATES']._serialized_end=2975
+   _globals['_EMBEDREQUEST']._serialized_start=2978
+   _globals['_EMBEDREQUEST']._serialized_end=3308
+   _globals['_EMBEDRESPONSE']._serialized_start=3311
+   _globals['_EMBEDRESPONSE']._serialized_end=3468
+   _globals['_EMBEDCOMPLETE']._serialized_start=3471
+   _globals['_EMBEDCOMPLETE']._serialized_end=3659
+   _globals['_EMBEDDING']._serialized_start=3661
+   _globals['_EMBEDDING']._serialized_end=3703
+   _globals['_EMBEDERROR']._serialized_start=3705
+   _globals['_EMBEDERROR']._serialized_end=3765
+   _globals['_HEALTHCHECKREQUEST']._serialized_start=3767
+   _globals['_HEALTHCHECKREQUEST']._serialized_end=3845
+   _globals['_HEALTHCHECKRESPONSE']._serialized_start=3847
+   _globals['_HEALTHCHECKRESPONSE']._serialized_end=3902
+   _globals['_ABORTREQUEST']._serialized_start=3904
+   _globals['_ABORTREQUEST']._serialized_end=3954
+   _globals['_ABORTRESPONSE']._serialized_start=3956
+   _globals['_ABORTRESPONSE']._serialized_end=4005
+   _globals['_LOADLORAREQUEST']._serialized_start=4007
+   _globals['_LOADLORAREQUEST']._serialized_end=4080
+   _globals['_LOADLORARESPONSE']._serialized_start=4082
+   _globals['_LOADLORARESPONSE']._serialized_end=4154
+   _globals['_UNLOADLORAREQUEST']._serialized_start=4156
+   _globals['_UNLOADLORAREQUEST']._serialized_end=4195
+   _globals['_UNLOADLORARESPONSE']._serialized_start=4197
+   _globals['_UNLOADLORARESPONSE']._serialized_end=4251
+   _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4253
+   _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4372
+   _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4374
+   _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4431
+   _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4433
+   _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4478
+   _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4480
+   _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4546
+   _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4548
+   _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4613
+   _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4615
+   _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4675
+   _globals['_SGLANGSCHEDULER']._serialized_start=4678
+   _globals['_SGLANGSCHEDULER']._serialized_end=5060
+ # @@protoc_insertion_point(module_scope)