sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,427 @@
1
+ import datetime
2
+
3
+ from google.protobuf import timestamp_pb2 as _timestamp_pb2
4
+ from google.protobuf import struct_pb2 as _struct_pb2
5
+ from google.protobuf.internal import containers as _containers
6
+ from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
7
+ from google.protobuf import descriptor as _descriptor
8
+ from google.protobuf import message as _message
9
+ from collections.abc import Iterable as _Iterable, Mapping as _Mapping
10
+ from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
11
+
12
+ DESCRIPTOR: _descriptor.FileDescriptor
13
+
14
+ class SamplingParams(_message.Message):
15
+ __slots__ = ("temperature", "top_p", "top_k", "min_p", "frequency_penalty", "presence_penalty", "repetition_penalty", "max_new_tokens", "stop", "stop_token_ids", "skip_special_tokens", "spaces_between_special_tokens", "regex", "json_schema", "ebnf_grammar", "lora_path", "n", "token_healing", "min_new_tokens", "ignore_eos", "no_stop_trim", "stream_interval", "logit_bias", "structural_tag", "custom_params")
16
+ class LogitBiasEntry(_message.Message):
17
+ __slots__ = ("key", "value")
18
+ KEY_FIELD_NUMBER: _ClassVar[int]
19
+ VALUE_FIELD_NUMBER: _ClassVar[int]
20
+ key: str
21
+ value: float
22
+ def __init__(self, key: _Optional[str] = ..., value: _Optional[float] = ...) -> None: ...
23
+ TEMPERATURE_FIELD_NUMBER: _ClassVar[int]
24
+ TOP_P_FIELD_NUMBER: _ClassVar[int]
25
+ TOP_K_FIELD_NUMBER: _ClassVar[int]
26
+ MIN_P_FIELD_NUMBER: _ClassVar[int]
27
+ FREQUENCY_PENALTY_FIELD_NUMBER: _ClassVar[int]
28
+ PRESENCE_PENALTY_FIELD_NUMBER: _ClassVar[int]
29
+ REPETITION_PENALTY_FIELD_NUMBER: _ClassVar[int]
30
+ MAX_NEW_TOKENS_FIELD_NUMBER: _ClassVar[int]
31
+ STOP_FIELD_NUMBER: _ClassVar[int]
32
+ STOP_TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
33
+ SKIP_SPECIAL_TOKENS_FIELD_NUMBER: _ClassVar[int]
34
+ SPACES_BETWEEN_SPECIAL_TOKENS_FIELD_NUMBER: _ClassVar[int]
35
+ REGEX_FIELD_NUMBER: _ClassVar[int]
36
+ JSON_SCHEMA_FIELD_NUMBER: _ClassVar[int]
37
+ EBNF_GRAMMAR_FIELD_NUMBER: _ClassVar[int]
38
+ LORA_PATH_FIELD_NUMBER: _ClassVar[int]
39
+ N_FIELD_NUMBER: _ClassVar[int]
40
+ TOKEN_HEALING_FIELD_NUMBER: _ClassVar[int]
41
+ MIN_NEW_TOKENS_FIELD_NUMBER: _ClassVar[int]
42
+ IGNORE_EOS_FIELD_NUMBER: _ClassVar[int]
43
+ NO_STOP_TRIM_FIELD_NUMBER: _ClassVar[int]
44
+ STREAM_INTERVAL_FIELD_NUMBER: _ClassVar[int]
45
+ LOGIT_BIAS_FIELD_NUMBER: _ClassVar[int]
46
+ STRUCTURAL_TAG_FIELD_NUMBER: _ClassVar[int]
47
+ CUSTOM_PARAMS_FIELD_NUMBER: _ClassVar[int]
48
+ temperature: float
49
+ top_p: float
50
+ top_k: int
51
+ min_p: float
52
+ frequency_penalty: float
53
+ presence_penalty: float
54
+ repetition_penalty: float
55
+ max_new_tokens: int
56
+ stop: _containers.RepeatedScalarFieldContainer[str]
57
+ stop_token_ids: _containers.RepeatedScalarFieldContainer[int]
58
+ skip_special_tokens: bool
59
+ spaces_between_special_tokens: bool
60
+ regex: str
61
+ json_schema: str
62
+ ebnf_grammar: str
63
+ lora_path: str
64
+ n: int
65
+ token_healing: bool
66
+ min_new_tokens: int
67
+ ignore_eos: bool
68
+ no_stop_trim: bool
69
+ stream_interval: int
70
+ logit_bias: _containers.ScalarMap[str, float]
71
+ structural_tag: str
72
+ custom_params: _struct_pb2.Struct
73
+ def __init__(self, temperature: _Optional[float] = ..., top_p: _Optional[float] = ..., top_k: _Optional[int] = ..., min_p: _Optional[float] = ..., frequency_penalty: _Optional[float] = ..., presence_penalty: _Optional[float] = ..., repetition_penalty: _Optional[float] = ..., max_new_tokens: _Optional[int] = ..., stop: _Optional[_Iterable[str]] = ..., stop_token_ids: _Optional[_Iterable[int]] = ..., skip_special_tokens: bool = ..., spaces_between_special_tokens: bool = ..., regex: _Optional[str] = ..., json_schema: _Optional[str] = ..., ebnf_grammar: _Optional[str] = ..., lora_path: _Optional[str] = ..., n: _Optional[int] = ..., token_healing: bool = ..., min_new_tokens: _Optional[int] = ..., ignore_eos: bool = ..., no_stop_trim: bool = ..., stream_interval: _Optional[int] = ..., logit_bias: _Optional[_Mapping[str, float]] = ..., structural_tag: _Optional[str] = ..., custom_params: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...) -> None: ...
74
+
75
+ class DisaggregatedParams(_message.Message):
76
+ __slots__ = ("bootstrap_host", "bootstrap_port", "bootstrap_room")
77
+ BOOTSTRAP_HOST_FIELD_NUMBER: _ClassVar[int]
78
+ BOOTSTRAP_PORT_FIELD_NUMBER: _ClassVar[int]
79
+ BOOTSTRAP_ROOM_FIELD_NUMBER: _ClassVar[int]
80
+ bootstrap_host: str
81
+ bootstrap_port: int
82
+ bootstrap_room: int
83
+ def __init__(self, bootstrap_host: _Optional[str] = ..., bootstrap_port: _Optional[int] = ..., bootstrap_room: _Optional[int] = ...) -> None: ...
84
+
85
+ class GenerateRequest(_message.Message):
86
+ __slots__ = ("request_id", "tokenized", "mm_inputs", "sampling_params", "return_logprob", "logprob_start_len", "top_logprobs_num", "token_ids_logprob", "return_hidden_states", "disaggregated_params", "custom_logit_processor", "timestamp", "log_metrics", "input_embeds", "lora_id", "data_parallel_rank", "dp_balance_id")
87
+ REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
88
+ TOKENIZED_FIELD_NUMBER: _ClassVar[int]
89
+ MM_INPUTS_FIELD_NUMBER: _ClassVar[int]
90
+ SAMPLING_PARAMS_FIELD_NUMBER: _ClassVar[int]
91
+ RETURN_LOGPROB_FIELD_NUMBER: _ClassVar[int]
92
+ LOGPROB_START_LEN_FIELD_NUMBER: _ClassVar[int]
93
+ TOP_LOGPROBS_NUM_FIELD_NUMBER: _ClassVar[int]
94
+ TOKEN_IDS_LOGPROB_FIELD_NUMBER: _ClassVar[int]
95
+ RETURN_HIDDEN_STATES_FIELD_NUMBER: _ClassVar[int]
96
+ DISAGGREGATED_PARAMS_FIELD_NUMBER: _ClassVar[int]
97
+ CUSTOM_LOGIT_PROCESSOR_FIELD_NUMBER: _ClassVar[int]
98
+ TIMESTAMP_FIELD_NUMBER: _ClassVar[int]
99
+ LOG_METRICS_FIELD_NUMBER: _ClassVar[int]
100
+ INPUT_EMBEDS_FIELD_NUMBER: _ClassVar[int]
101
+ LORA_ID_FIELD_NUMBER: _ClassVar[int]
102
+ DATA_PARALLEL_RANK_FIELD_NUMBER: _ClassVar[int]
103
+ DP_BALANCE_ID_FIELD_NUMBER: _ClassVar[int]
104
+ request_id: str
105
+ tokenized: TokenizedInput
106
+ mm_inputs: MultimodalInputs
107
+ sampling_params: SamplingParams
108
+ return_logprob: bool
109
+ logprob_start_len: int
110
+ top_logprobs_num: int
111
+ token_ids_logprob: _containers.RepeatedScalarFieldContainer[int]
112
+ return_hidden_states: bool
113
+ disaggregated_params: DisaggregatedParams
114
+ custom_logit_processor: str
115
+ timestamp: _timestamp_pb2.Timestamp
116
+ log_metrics: bool
117
+ input_embeds: _containers.RepeatedScalarFieldContainer[float]
118
+ lora_id: str
119
+ data_parallel_rank: int
120
+ dp_balance_id: int
121
+ def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ..., mm_inputs: _Optional[_Union[MultimodalInputs, _Mapping]] = ..., sampling_params: _Optional[_Union[SamplingParams, _Mapping]] = ..., return_logprob: bool = ..., logprob_start_len: _Optional[int] = ..., top_logprobs_num: _Optional[int] = ..., token_ids_logprob: _Optional[_Iterable[int]] = ..., return_hidden_states: bool = ..., disaggregated_params: _Optional[_Union[DisaggregatedParams, _Mapping]] = ..., custom_logit_processor: _Optional[str] = ..., timestamp: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ..., log_metrics: bool = ..., input_embeds: _Optional[_Iterable[float]] = ..., lora_id: _Optional[str] = ..., data_parallel_rank: _Optional[int] = ..., dp_balance_id: _Optional[int] = ...) -> None: ...
122
+
123
+ class TokenizedInput(_message.Message):
124
+ __slots__ = ("original_text", "input_ids")
125
+ ORIGINAL_TEXT_FIELD_NUMBER: _ClassVar[int]
126
+ INPUT_IDS_FIELD_NUMBER: _ClassVar[int]
127
+ original_text: str
128
+ input_ids: _containers.RepeatedScalarFieldContainer[int]
129
+ def __init__(self, original_text: _Optional[str] = ..., input_ids: _Optional[_Iterable[int]] = ...) -> None: ...
130
+
131
+ class MultimodalInputs(_message.Message):
132
+ __slots__ = ("image_urls", "video_urls", "audio_urls", "processed_features", "image_data", "video_data", "audio_data", "modalities")
133
+ IMAGE_URLS_FIELD_NUMBER: _ClassVar[int]
134
+ VIDEO_URLS_FIELD_NUMBER: _ClassVar[int]
135
+ AUDIO_URLS_FIELD_NUMBER: _ClassVar[int]
136
+ PROCESSED_FEATURES_FIELD_NUMBER: _ClassVar[int]
137
+ IMAGE_DATA_FIELD_NUMBER: _ClassVar[int]
138
+ VIDEO_DATA_FIELD_NUMBER: _ClassVar[int]
139
+ AUDIO_DATA_FIELD_NUMBER: _ClassVar[int]
140
+ MODALITIES_FIELD_NUMBER: _ClassVar[int]
141
+ image_urls: _containers.RepeatedScalarFieldContainer[str]
142
+ video_urls: _containers.RepeatedScalarFieldContainer[str]
143
+ audio_urls: _containers.RepeatedScalarFieldContainer[str]
144
+ processed_features: _struct_pb2.Struct
145
+ image_data: _containers.RepeatedScalarFieldContainer[bytes]
146
+ video_data: _containers.RepeatedScalarFieldContainer[bytes]
147
+ audio_data: _containers.RepeatedScalarFieldContainer[bytes]
148
+ modalities: _containers.RepeatedScalarFieldContainer[str]
149
+ def __init__(self, image_urls: _Optional[_Iterable[str]] = ..., video_urls: _Optional[_Iterable[str]] = ..., audio_urls: _Optional[_Iterable[str]] = ..., processed_features: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ..., image_data: _Optional[_Iterable[bytes]] = ..., video_data: _Optional[_Iterable[bytes]] = ..., audio_data: _Optional[_Iterable[bytes]] = ..., modalities: _Optional[_Iterable[str]] = ...) -> None: ...
150
+
151
+ class GenerateResponse(_message.Message):
152
+ __slots__ = ("request_id", "chunk", "complete", "error")
153
+ REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
154
+ CHUNK_FIELD_NUMBER: _ClassVar[int]
155
+ COMPLETE_FIELD_NUMBER: _ClassVar[int]
156
+ ERROR_FIELD_NUMBER: _ClassVar[int]
157
+ request_id: str
158
+ chunk: GenerateStreamChunk
159
+ complete: GenerateComplete
160
+ error: GenerateError
161
+ def __init__(self, request_id: _Optional[str] = ..., chunk: _Optional[_Union[GenerateStreamChunk, _Mapping]] = ..., complete: _Optional[_Union[GenerateComplete, _Mapping]] = ..., error: _Optional[_Union[GenerateError, _Mapping]] = ...) -> None: ...
162
+
163
+ class GenerateStreamChunk(_message.Message):
164
+ __slots__ = ("token_id", "text", "prompt_tokens", "completion_tokens", "cached_tokens", "logprobs", "hidden_states", "generation_time", "queue_time")
165
+ TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
166
+ TEXT_FIELD_NUMBER: _ClassVar[int]
167
+ PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int]
168
+ COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int]
169
+ CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int]
170
+ LOGPROBS_FIELD_NUMBER: _ClassVar[int]
171
+ HIDDEN_STATES_FIELD_NUMBER: _ClassVar[int]
172
+ GENERATION_TIME_FIELD_NUMBER: _ClassVar[int]
173
+ QUEUE_TIME_FIELD_NUMBER: _ClassVar[int]
174
+ token_id: int
175
+ text: str
176
+ prompt_tokens: int
177
+ completion_tokens: int
178
+ cached_tokens: int
179
+ logprobs: LogProbs
180
+ hidden_states: _containers.RepeatedScalarFieldContainer[float]
181
+ generation_time: float
182
+ queue_time: int
183
+ def __init__(self, token_id: _Optional[int] = ..., text: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., logprobs: _Optional[_Union[LogProbs, _Mapping]] = ..., hidden_states: _Optional[_Iterable[float]] = ..., generation_time: _Optional[float] = ..., queue_time: _Optional[int] = ...) -> None: ...
184
+
185
+ class GenerateComplete(_message.Message):
186
+ __slots__ = ("output_ids", "output_text", "finish_reason", "all_logprobs", "all_hidden_states")
187
+ class FinishReason(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
188
+ __slots__ = ()
189
+ STOP: _ClassVar[GenerateComplete.FinishReason]
190
+ LENGTH: _ClassVar[GenerateComplete.FinishReason]
191
+ EOS_TOKEN: _ClassVar[GenerateComplete.FinishReason]
192
+ STOP_STR: _ClassVar[GenerateComplete.FinishReason]
193
+ ABORT: _ClassVar[GenerateComplete.FinishReason]
194
+ STOP: GenerateComplete.FinishReason
195
+ LENGTH: GenerateComplete.FinishReason
196
+ EOS_TOKEN: GenerateComplete.FinishReason
197
+ STOP_STR: GenerateComplete.FinishReason
198
+ ABORT: GenerateComplete.FinishReason
199
+ OUTPUT_IDS_FIELD_NUMBER: _ClassVar[int]
200
+ OUTPUT_TEXT_FIELD_NUMBER: _ClassVar[int]
201
+ FINISH_REASON_FIELD_NUMBER: _ClassVar[int]
202
+ ALL_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
203
+ ALL_HIDDEN_STATES_FIELD_NUMBER: _ClassVar[int]
204
+ output_ids: _containers.RepeatedScalarFieldContainer[int]
205
+ output_text: str
206
+ finish_reason: GenerateComplete.FinishReason
207
+ all_logprobs: _containers.RepeatedCompositeFieldContainer[LogProbs]
208
+ all_hidden_states: _containers.RepeatedCompositeFieldContainer[HiddenStates]
209
+ def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., output_text: _Optional[str] = ..., finish_reason: _Optional[_Union[GenerateComplete.FinishReason, str]] = ..., all_logprobs: _Optional[_Iterable[_Union[LogProbs, _Mapping]]] = ..., all_hidden_states: _Optional[_Iterable[_Union[HiddenStates, _Mapping]]] = ...) -> None: ...
210
+
211
+ class GenerateError(_message.Message):
212
+ __slots__ = ("message", "http_status_code", "details")
213
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
214
+ HTTP_STATUS_CODE_FIELD_NUMBER: _ClassVar[int]
215
+ DETAILS_FIELD_NUMBER: _ClassVar[int]
216
+ message: str
217
+ http_status_code: str
218
+ details: str
219
+ def __init__(self, message: _Optional[str] = ..., http_status_code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ...
220
+
221
+ class LogProbs(_message.Message):
222
+ __slots__ = ("token_logprobs", "token_ids", "top_logprobs", "token_texts")
223
+ TOKEN_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
224
+ TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
225
+ TOP_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
226
+ TOKEN_TEXTS_FIELD_NUMBER: _ClassVar[int]
227
+ token_logprobs: _containers.RepeatedScalarFieldContainer[float]
228
+ token_ids: _containers.RepeatedScalarFieldContainer[int]
229
+ top_logprobs: _containers.RepeatedCompositeFieldContainer[TopLogProbs]
230
+ token_texts: _containers.RepeatedScalarFieldContainer[str]
231
+ def __init__(self, token_logprobs: _Optional[_Iterable[float]] = ..., token_ids: _Optional[_Iterable[int]] = ..., top_logprobs: _Optional[_Iterable[_Union[TopLogProbs, _Mapping]]] = ..., token_texts: _Optional[_Iterable[str]] = ...) -> None: ...
232
+
233
+ class TopLogProbs(_message.Message):
234
+ __slots__ = ("values", "token_ids", "token_texts")
235
+ VALUES_FIELD_NUMBER: _ClassVar[int]
236
+ TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
237
+ TOKEN_TEXTS_FIELD_NUMBER: _ClassVar[int]
238
+ values: _containers.RepeatedScalarFieldContainer[float]
239
+ token_ids: _containers.RepeatedScalarFieldContainer[int]
240
+ token_texts: _containers.RepeatedScalarFieldContainer[str]
241
+ def __init__(self, values: _Optional[_Iterable[float]] = ..., token_ids: _Optional[_Iterable[int]] = ..., token_texts: _Optional[_Iterable[str]] = ...) -> None: ...
242
+
243
+ class HiddenStates(_message.Message):
244
+ __slots__ = ("values", "layer", "position")
245
+ VALUES_FIELD_NUMBER: _ClassVar[int]
246
+ LAYER_FIELD_NUMBER: _ClassVar[int]
247
+ POSITION_FIELD_NUMBER: _ClassVar[int]
248
+ values: _containers.RepeatedScalarFieldContainer[float]
249
+ layer: int
250
+ position: int
251
+ def __init__(self, values: _Optional[_Iterable[float]] = ..., layer: _Optional[int] = ..., position: _Optional[int] = ...) -> None: ...
252
+
253
+ class EmbedRequest(_message.Message):
254
+ __slots__ = ("request_id", "tokenized", "mm_inputs", "sampling_params", "log_metrics", "token_type_ids", "data_parallel_rank", "is_cross_encoder", "texts")
255
+ REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
256
+ TOKENIZED_FIELD_NUMBER: _ClassVar[int]
257
+ MM_INPUTS_FIELD_NUMBER: _ClassVar[int]
258
+ SAMPLING_PARAMS_FIELD_NUMBER: _ClassVar[int]
259
+ LOG_METRICS_FIELD_NUMBER: _ClassVar[int]
260
+ TOKEN_TYPE_IDS_FIELD_NUMBER: _ClassVar[int]
261
+ DATA_PARALLEL_RANK_FIELD_NUMBER: _ClassVar[int]
262
+ IS_CROSS_ENCODER_FIELD_NUMBER: _ClassVar[int]
263
+ TEXTS_FIELD_NUMBER: _ClassVar[int]
264
+ request_id: str
265
+ tokenized: TokenizedInput
266
+ mm_inputs: MultimodalInputs
267
+ sampling_params: SamplingParams
268
+ log_metrics: bool
269
+ token_type_ids: _containers.RepeatedScalarFieldContainer[int]
270
+ data_parallel_rank: int
271
+ is_cross_encoder: bool
272
+ texts: _containers.RepeatedScalarFieldContainer[str]
273
+ def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ..., mm_inputs: _Optional[_Union[MultimodalInputs, _Mapping]] = ..., sampling_params: _Optional[_Union[SamplingParams, _Mapping]] = ..., log_metrics: bool = ..., token_type_ids: _Optional[_Iterable[int]] = ..., data_parallel_rank: _Optional[int] = ..., is_cross_encoder: bool = ..., texts: _Optional[_Iterable[str]] = ...) -> None: ...
274
+
275
+ class EmbedResponse(_message.Message):
276
+ __slots__ = ("request_id", "complete", "error")
277
+ REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
278
+ COMPLETE_FIELD_NUMBER: _ClassVar[int]
279
+ ERROR_FIELD_NUMBER: _ClassVar[int]
280
+ request_id: str
281
+ complete: EmbedComplete
282
+ error: EmbedError
283
+ def __init__(self, request_id: _Optional[str] = ..., complete: _Optional[_Union[EmbedComplete, _Mapping]] = ..., error: _Optional[_Union[EmbedError, _Mapping]] = ...) -> None: ...
284
+
285
+ class EmbedComplete(_message.Message):
286
+ __slots__ = ("embedding", "prompt_tokens", "cached_tokens", "embedding_dim", "generation_time", "batch_embeddings")
287
+ EMBEDDING_FIELD_NUMBER: _ClassVar[int]
288
+ PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int]
289
+ CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int]
290
+ EMBEDDING_DIM_FIELD_NUMBER: _ClassVar[int]
291
+ GENERATION_TIME_FIELD_NUMBER: _ClassVar[int]
292
+ BATCH_EMBEDDINGS_FIELD_NUMBER: _ClassVar[int]
293
+ embedding: _containers.RepeatedScalarFieldContainer[float]
294
+ prompt_tokens: int
295
+ cached_tokens: int
296
+ embedding_dim: int
297
+ generation_time: float
298
+ batch_embeddings: _containers.RepeatedCompositeFieldContainer[Embedding]
299
+ def __init__(self, embedding: _Optional[_Iterable[float]] = ..., prompt_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., embedding_dim: _Optional[int] = ..., generation_time: _Optional[float] = ..., batch_embeddings: _Optional[_Iterable[_Union[Embedding, _Mapping]]] = ...) -> None: ...
300
+
301
+ class Embedding(_message.Message):
302
+ __slots__ = ("values", "index")
303
+ VALUES_FIELD_NUMBER: _ClassVar[int]
304
+ INDEX_FIELD_NUMBER: _ClassVar[int]
305
+ values: _containers.RepeatedScalarFieldContainer[float]
306
+ index: int
307
+ def __init__(self, values: _Optional[_Iterable[float]] = ..., index: _Optional[int] = ...) -> None: ...
308
+
309
+ class EmbedError(_message.Message):
310
+ __slots__ = ("message", "code", "details")
311
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
312
+ CODE_FIELD_NUMBER: _ClassVar[int]
313
+ DETAILS_FIELD_NUMBER: _ClassVar[int]
314
+ message: str
315
+ code: str
316
+ details: str
317
+ def __init__(self, message: _Optional[str] = ..., code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ...
318
+
319
+ class HealthCheckRequest(_message.Message):
320
+ __slots__ = ("tokenized",)
321
+ TOKENIZED_FIELD_NUMBER: _ClassVar[int]
322
+ tokenized: TokenizedInput
323
+ def __init__(self, tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ...) -> None: ...
324
+
325
+ class HealthCheckResponse(_message.Message):
326
+ __slots__ = ("healthy", "message")
327
+ HEALTHY_FIELD_NUMBER: _ClassVar[int]
328
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
329
+ healthy: bool
330
+ message: str
331
+ def __init__(self, healthy: bool = ..., message: _Optional[str] = ...) -> None: ...
332
+
333
+ class AbortRequest(_message.Message):
334
+ __slots__ = ("request_id", "reason")
335
+ REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
336
+ REASON_FIELD_NUMBER: _ClassVar[int]
337
+ request_id: str
338
+ reason: str
339
+ def __init__(self, request_id: _Optional[str] = ..., reason: _Optional[str] = ...) -> None: ...
340
+
341
+ class AbortResponse(_message.Message):
342
+ __slots__ = ("success", "message")
343
+ SUCCESS_FIELD_NUMBER: _ClassVar[int]
344
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
345
+ success: bool
346
+ message: str
347
+ def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
348
+
349
+ class LoadLoRARequest(_message.Message):
350
+ __slots__ = ("adapter_id", "adapter_path", "rank")
351
+ ADAPTER_ID_FIELD_NUMBER: _ClassVar[int]
352
+ ADAPTER_PATH_FIELD_NUMBER: _ClassVar[int]
353
+ RANK_FIELD_NUMBER: _ClassVar[int]
354
+ adapter_id: str
355
+ adapter_path: str
356
+ rank: int
357
+ def __init__(self, adapter_id: _Optional[str] = ..., adapter_path: _Optional[str] = ..., rank: _Optional[int] = ...) -> None: ...
358
+
359
+ class LoadLoRAResponse(_message.Message):
360
+ __slots__ = ("success", "adapter_id", "message")
361
+ SUCCESS_FIELD_NUMBER: _ClassVar[int]
362
+ ADAPTER_ID_FIELD_NUMBER: _ClassVar[int]
363
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
364
+ success: bool
365
+ adapter_id: str
366
+ message: str
367
+ def __init__(self, success: bool = ..., adapter_id: _Optional[str] = ..., message: _Optional[str] = ...) -> None: ...
368
+
369
+ class UnloadLoRARequest(_message.Message):
370
+ __slots__ = ("adapter_id",)
371
+ ADAPTER_ID_FIELD_NUMBER: _ClassVar[int]
372
+ adapter_id: str
373
+ def __init__(self, adapter_id: _Optional[str] = ...) -> None: ...
374
+
375
+ class UnloadLoRAResponse(_message.Message):
376
+ __slots__ = ("success", "message")
377
+ SUCCESS_FIELD_NUMBER: _ClassVar[int]
378
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
379
+ success: bool
380
+ message: str
381
+ def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
382
+
383
+ class UpdateWeightsRequest(_message.Message):
384
+ __slots__ = ("disk_path", "tensor_data", "remote_url", "weight_name")
385
+ DISK_PATH_FIELD_NUMBER: _ClassVar[int]
386
+ TENSOR_DATA_FIELD_NUMBER: _ClassVar[int]
387
+ REMOTE_URL_FIELD_NUMBER: _ClassVar[int]
388
+ WEIGHT_NAME_FIELD_NUMBER: _ClassVar[int]
389
+ disk_path: str
390
+ tensor_data: bytes
391
+ remote_url: str
392
+ weight_name: str
393
+ def __init__(self, disk_path: _Optional[str] = ..., tensor_data: _Optional[bytes] = ..., remote_url: _Optional[str] = ..., weight_name: _Optional[str] = ...) -> None: ...
394
+
395
+ class UpdateWeightsResponse(_message.Message):
396
+ __slots__ = ("success", "message")
397
+ SUCCESS_FIELD_NUMBER: _ClassVar[int]
398
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
399
+ success: bool
400
+ message: str
401
+ def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
402
+
403
+ class GetInternalStateRequest(_message.Message):
404
+ __slots__ = ("state_keys",)
405
+ STATE_KEYS_FIELD_NUMBER: _ClassVar[int]
406
+ state_keys: _containers.RepeatedScalarFieldContainer[str]
407
+ def __init__(self, state_keys: _Optional[_Iterable[str]] = ...) -> None: ...
408
+
409
+ class GetInternalStateResponse(_message.Message):
410
+ __slots__ = ("state",)
411
+ STATE_FIELD_NUMBER: _ClassVar[int]
412
+ state: _struct_pb2.Struct
413
+ def __init__(self, state: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...) -> None: ...
414
+
415
+ class SetInternalStateRequest(_message.Message):
416
+ __slots__ = ("state",)
417
+ STATE_FIELD_NUMBER: _ClassVar[int]
418
+ state: _struct_pb2.Struct
419
+ def __init__(self, state: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...) -> None: ...
420
+
421
+ class SetInternalStateResponse(_message.Message):
422
+ __slots__ = ("success", "message")
423
+ SUCCESS_FIELD_NUMBER: _ClassVar[int]
424
+ MESSAGE_FIELD_NUMBER: _ClassVar[int]
425
+ success: bool
426
+ message: str
427
+ def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
@@ -0,0 +1,236 @@
1
+ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2
+ """Client and server classes corresponding to protobuf-defined services."""
3
+ import grpc
4
+ import warnings
5
+
6
+ from . import sglang_scheduler_pb2 as sglang__scheduler__pb2
7
+
8
+ GRPC_GENERATED_VERSION = '1.74.0'
9
+ GRPC_VERSION = grpc.__version__
10
+ _version_not_supported = False
11
+
12
+ try:
13
+ from grpc._utilities import first_version_is_lower
14
+ _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
15
+ except ImportError:
16
+ _version_not_supported = True
17
+
18
+ if _version_not_supported:
19
+ raise RuntimeError(
20
+ f'The grpc package installed is at version {GRPC_VERSION},'
21
+ + f' but the generated code in sglang_scheduler_pb2_grpc.py depends on'
22
+ + f' grpcio>={GRPC_GENERATED_VERSION}.'
23
+ + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
24
+ + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
25
+ )
26
+
27
+
28
+ class SglangSchedulerStub(object):
29
+ """Service definition for SGLang scheduler communication
30
+ This protocol bridges the Rust router and Python scheduler
31
+ """
32
+
33
+ def __init__(self, channel):
34
+ """Constructor.
35
+
36
+ Args:
37
+ channel: A grpc.Channel.
38
+ """
39
+ self.Generate = channel.unary_stream(
40
+ '/sglang.grpc.scheduler.SglangScheduler/Generate',
41
+ request_serializer=sglang__scheduler__pb2.GenerateRequest.SerializeToString,
42
+ response_deserializer=sglang__scheduler__pb2.GenerateResponse.FromString,
43
+ _registered_method=True)
44
+ self.Embed = channel.unary_unary(
45
+ '/sglang.grpc.scheduler.SglangScheduler/Embed',
46
+ request_serializer=sglang__scheduler__pb2.EmbedRequest.SerializeToString,
47
+ response_deserializer=sglang__scheduler__pb2.EmbedResponse.FromString,
48
+ _registered_method=True)
49
+ self.HealthCheck = channel.unary_unary(
50
+ '/sglang.grpc.scheduler.SglangScheduler/HealthCheck',
51
+ request_serializer=sglang__scheduler__pb2.HealthCheckRequest.SerializeToString,
52
+ response_deserializer=sglang__scheduler__pb2.HealthCheckResponse.FromString,
53
+ _registered_method=True)
54
+ self.Abort = channel.unary_unary(
55
+ '/sglang.grpc.scheduler.SglangScheduler/Abort',
56
+ request_serializer=sglang__scheduler__pb2.AbortRequest.SerializeToString,
57
+ response_deserializer=sglang__scheduler__pb2.AbortResponse.FromString,
58
+ _registered_method=True)
59
+
60
+
61
+ class SglangSchedulerServicer(object):
62
+ """Service definition for SGLang scheduler communication
63
+ This protocol bridges the Rust router and Python scheduler
64
+ """
65
+
66
+ def Generate(self, request, context):
67
+ """Submit a generation request (supports streaming)
68
+ """
69
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
70
+ context.set_details('Method not implemented!')
71
+ raise NotImplementedError('Method not implemented!')
72
+
73
+ def Embed(self, request, context):
74
+ """Submit an embedding request
75
+ """
76
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
77
+ context.set_details('Method not implemented!')
78
+ raise NotImplementedError('Method not implemented!')
79
+
80
+ def HealthCheck(self, request, context):
81
+ """Health check and metrics
82
+ """
83
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
84
+ context.set_details('Method not implemented!')
85
+ raise NotImplementedError('Method not implemented!')
86
+
87
+ def Abort(self, request, context):
88
+ """Abort a running request
89
+ """
90
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
91
+ context.set_details('Method not implemented!')
92
+ raise NotImplementedError('Method not implemented!')
93
+
94
+
95
+ def add_SglangSchedulerServicer_to_server(servicer, server):
96
+ rpc_method_handlers = {
97
+ 'Generate': grpc.unary_stream_rpc_method_handler(
98
+ servicer.Generate,
99
+ request_deserializer=sglang__scheduler__pb2.GenerateRequest.FromString,
100
+ response_serializer=sglang__scheduler__pb2.GenerateResponse.SerializeToString,
101
+ ),
102
+ 'Embed': grpc.unary_unary_rpc_method_handler(
103
+ servicer.Embed,
104
+ request_deserializer=sglang__scheduler__pb2.EmbedRequest.FromString,
105
+ response_serializer=sglang__scheduler__pb2.EmbedResponse.SerializeToString,
106
+ ),
107
+ 'HealthCheck': grpc.unary_unary_rpc_method_handler(
108
+ servicer.HealthCheck,
109
+ request_deserializer=sglang__scheduler__pb2.HealthCheckRequest.FromString,
110
+ response_serializer=sglang__scheduler__pb2.HealthCheckResponse.SerializeToString,
111
+ ),
112
+ 'Abort': grpc.unary_unary_rpc_method_handler(
113
+ servicer.Abort,
114
+ request_deserializer=sglang__scheduler__pb2.AbortRequest.FromString,
115
+ response_serializer=sglang__scheduler__pb2.AbortResponse.SerializeToString,
116
+ ),
117
+ }
118
+ generic_handler = grpc.method_handlers_generic_handler(
119
+ 'sglang.grpc.scheduler.SglangScheduler', rpc_method_handlers)
120
+ server.add_generic_rpc_handlers((generic_handler,))
121
+ server.add_registered_method_handlers('sglang.grpc.scheduler.SglangScheduler', rpc_method_handlers)
122
+
123
+
124
+ # This class is part of an EXPERIMENTAL API.
125
+ class SglangScheduler(object):
126
+ """Service definition for SGLang scheduler communication
127
+ This protocol bridges the Rust router and Python scheduler
128
+ """
129
+
130
+ @staticmethod
131
+ def Generate(request,
132
+ target,
133
+ options=(),
134
+ channel_credentials=None,
135
+ call_credentials=None,
136
+ insecure=False,
137
+ compression=None,
138
+ wait_for_ready=None,
139
+ timeout=None,
140
+ metadata=None):
141
+ return grpc.experimental.unary_stream(
142
+ request,
143
+ target,
144
+ '/sglang.grpc.scheduler.SglangScheduler/Generate',
145
+ sglang__scheduler__pb2.GenerateRequest.SerializeToString,
146
+ sglang__scheduler__pb2.GenerateResponse.FromString,
147
+ options,
148
+ channel_credentials,
149
+ insecure,
150
+ call_credentials,
151
+ compression,
152
+ wait_for_ready,
153
+ timeout,
154
+ metadata,
155
+ _registered_method=True)
156
+
157
+ @staticmethod
158
+ def Embed(request,
159
+ target,
160
+ options=(),
161
+ channel_credentials=None,
162
+ call_credentials=None,
163
+ insecure=False,
164
+ compression=None,
165
+ wait_for_ready=None,
166
+ timeout=None,
167
+ metadata=None):
168
+ return grpc.experimental.unary_unary(
169
+ request,
170
+ target,
171
+ '/sglang.grpc.scheduler.SglangScheduler/Embed',
172
+ sglang__scheduler__pb2.EmbedRequest.SerializeToString,
173
+ sglang__scheduler__pb2.EmbedResponse.FromString,
174
+ options,
175
+ channel_credentials,
176
+ insecure,
177
+ call_credentials,
178
+ compression,
179
+ wait_for_ready,
180
+ timeout,
181
+ metadata,
182
+ _registered_method=True)
183
+
184
+ @staticmethod
185
+ def HealthCheck(request,
186
+ target,
187
+ options=(),
188
+ channel_credentials=None,
189
+ call_credentials=None,
190
+ insecure=False,
191
+ compression=None,
192
+ wait_for_ready=None,
193
+ timeout=None,
194
+ metadata=None):
195
+ return grpc.experimental.unary_unary(
196
+ request,
197
+ target,
198
+ '/sglang.grpc.scheduler.SglangScheduler/HealthCheck',
199
+ sglang__scheduler__pb2.HealthCheckRequest.SerializeToString,
200
+ sglang__scheduler__pb2.HealthCheckResponse.FromString,
201
+ options,
202
+ channel_credentials,
203
+ insecure,
204
+ call_credentials,
205
+ compression,
206
+ wait_for_ready,
207
+ timeout,
208
+ metadata,
209
+ _registered_method=True)
210
+
211
+ @staticmethod
212
+ def Abort(request,
213
+ target,
214
+ options=(),
215
+ channel_credentials=None,
216
+ call_credentials=None,
217
+ insecure=False,
218
+ compression=None,
219
+ wait_for_ready=None,
220
+ timeout=None,
221
+ metadata=None):
222
+ return grpc.experimental.unary_unary(
223
+ request,
224
+ target,
225
+ '/sglang.grpc.scheduler.SglangScheduler/Abort',
226
+ sglang__scheduler__pb2.AbortRequest.SerializeToString,
227
+ sglang__scheduler__pb2.AbortResponse.FromString,
228
+ options,
229
+ channel_credentials,
230
+ insecure,
231
+ call_credentials,
232
+ compression,
233
+ wait_for_ready,
234
+ timeout,
235
+ metadata,
236
+ _registered_method=True)
@@ -38,10 +38,12 @@ from sglang.srt.configs import (
38
38
  ChatGLMConfig,
39
39
  DbrxConfig,
40
40
  DeepseekVL2Config,
41
+ DotsVLMConfig,
41
42
  ExaoneConfig,
42
43
  KimiVLConfig,
43
44
  LongcatFlashConfig,
44
45
  MultiModalityConfig,
46
+ Qwen3NextConfig,
45
47
  Step3VLConfig,
46
48
  )
47
49
  from sglang.srt.configs.internvl import InternVLChatConfig
@@ -58,6 +60,8 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
58
60
  InternVLChatConfig.model_type: InternVLChatConfig,
59
61
  Step3VLConfig.model_type: Step3VLConfig,
60
62
  LongcatFlashConfig.model_type: LongcatFlashConfig,
63
+ Qwen3NextConfig.model_type: Qwen3NextConfig,
64
+ DotsVLMConfig.model_type: DotsVLMConfig,
61
65
  }
62
66
 
63
67
  for name, cls in _CONFIG_REGISTRY.items():