sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395):
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/lang/interpreter.py +1 -1
  7. sglang/launch_server.py +14 -0
  8. sglang/profiler.py +2 -2
  9. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  10. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  11. sglang/srt/configs/__init__.py +8 -0
  12. sglang/srt/configs/device_config.py +3 -1
  13. sglang/srt/configs/dots_ocr.py +64 -0
  14. sglang/srt/configs/dots_vlm.py +139 -0
  15. sglang/srt/configs/falcon_h1.py +360 -0
  16. sglang/srt/configs/internvl.py +6 -0
  17. sglang/srt/configs/load_config.py +9 -0
  18. sglang/srt/configs/model_config.py +181 -82
  19. sglang/srt/configs/qwen3_next.py +326 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +71 -19
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +326 -53
  44. sglang/srt/disaggregation/prefill.py +36 -17
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +192 -113
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  52. sglang/srt/entrypoints/grpc_server.py +810 -0
  53. sglang/srt/entrypoints/http_server.py +132 -57
  54. sglang/srt/entrypoints/openai/protocol.py +115 -7
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +207 -58
  57. sglang/srt/entrypoints/openai/serving_completions.py +17 -4
  58. sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +49 -4
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/environ.py +285 -0
  63. sglang/srt/eplb/eplb_manager.py +2 -2
  64. sglang/srt/eplb/expert_distribution.py +26 -13
  65. sglang/srt/eplb/expert_location.py +38 -8
  66. sglang/srt/eplb/expert_location_updater.py +1 -1
  67. sglang/srt/function_call/base_format_detector.py +3 -6
  68. sglang/srt/function_call/ebnf_composer.py +11 -9
  69. sglang/srt/function_call/function_call_parser.py +9 -2
  70. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  71. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  72. sglang/srt/function_call/json_array_parser.py +63 -0
  73. sglang/srt/function_call/kimik2_detector.py +17 -4
  74. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  75. sglang/srt/function_call/utils.py +96 -5
  76. sglang/srt/grpc/__init__.py +1 -0
  77. sglang/srt/grpc/compile_proto.py +245 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  81. sglang/srt/layers/activation.py +143 -9
  82. sglang/srt/layers/attention/aiter_backend.py +106 -82
  83. sglang/srt/layers/attention/ascend_backend.py +115 -9
  84. sglang/srt/layers/attention/attention_registry.py +206 -0
  85. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  86. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  87. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  88. sglang/srt/layers/attention/fla/chunk.py +242 -0
  89. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  90. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  91. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  92. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  93. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  94. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  95. sglang/srt/layers/attention/fla/index.py +37 -0
  96. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  97. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  98. sglang/srt/layers/attention/fla/op.py +66 -0
  99. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  100. sglang/srt/layers/attention/fla/utils.py +331 -0
  101. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  102. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  103. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  104. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  105. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  106. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  107. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  108. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  109. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  111. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  112. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  113. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  114. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  115. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  121. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  122. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  123. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  124. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  125. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  126. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  127. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  128. sglang/srt/layers/attention/nsa/utils.py +24 -0
  129. sglang/srt/layers/attention/nsa_backend.py +887 -0
  130. sglang/srt/layers/attention/tbo_backend.py +6 -6
  131. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  132. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  133. sglang/srt/layers/attention/triton_backend.py +57 -7
  134. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  135. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  136. sglang/srt/layers/attention/vision.py +58 -0
  137. sglang/srt/layers/attention/wave_backend.py +4 -4
  138. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  139. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  140. sglang/srt/layers/communicator.py +53 -7
  141. sglang/srt/layers/dp_attention.py +41 -2
  142. sglang/srt/layers/elementwise.py +3 -1
  143. sglang/srt/layers/layernorm.py +34 -15
  144. sglang/srt/layers/linear.py +55 -7
  145. sglang/srt/layers/logits_processor.py +44 -12
  146. sglang/srt/layers/moe/__init__.py +2 -1
  147. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  148. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  149. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  150. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  151. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  167. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  169. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  170. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  171. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  172. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  173. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  174. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  175. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  176. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  177. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  178. sglang/srt/layers/moe/topk.py +30 -9
  179. sglang/srt/layers/moe/utils.py +22 -7
  180. sglang/srt/layers/parameter.py +23 -6
  181. sglang/srt/layers/quantization/awq.py +19 -7
  182. sglang/srt/layers/quantization/base_config.py +11 -6
  183. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  184. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  185. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  186. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  187. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  188. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  189. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  190. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  191. sglang/srt/layers/quantization/fp8.py +78 -49
  192. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  193. sglang/srt/layers/quantization/gptq.py +25 -17
  194. sglang/srt/layers/quantization/modelopt_quant.py +225 -57
  195. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  196. sglang/srt/layers/quantization/mxfp4.py +77 -42
  197. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  199. sglang/srt/layers/quantization/quark/utils.py +97 -0
  200. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  201. sglang/srt/layers/quantization/unquant.py +135 -47
  202. sglang/srt/layers/quantization/w4afp8.py +26 -17
  203. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  204. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  205. sglang/srt/layers/rocm_linear_utils.py +44 -0
  206. sglang/srt/layers/rotary_embedding.py +78 -49
  207. sglang/srt/layers/sampler.py +213 -21
  208. sglang/srt/layers/utils.py +23 -0
  209. sglang/srt/lora/backend/base_backend.py +50 -8
  210. sglang/srt/lora/backend/chunked_backend.py +348 -0
  211. sglang/srt/lora/backend/triton_backend.py +99 -5
  212. sglang/srt/lora/layers.py +32 -0
  213. sglang/srt/lora/lora.py +8 -3
  214. sglang/srt/lora/lora_manager.py +52 -118
  215. sglang/srt/lora/mem_pool.py +25 -11
  216. sglang/srt/lora/triton_ops/__init__.py +4 -0
  217. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  219. sglang/srt/lora/utils.py +22 -11
  220. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  221. sglang/srt/managers/cache_controller.py +215 -314
  222. sglang/srt/managers/data_parallel_controller.py +115 -80
  223. sglang/srt/managers/detokenizer_manager.py +19 -15
  224. sglang/srt/managers/disagg_service.py +46 -0
  225. sglang/srt/managers/io_struct.py +340 -109
  226. sglang/srt/managers/mm_utils.py +44 -6
  227. sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
  228. sglang/srt/managers/multimodal_processor.py +1 -2
  229. sglang/srt/managers/overlap_utils.py +53 -0
  230. sglang/srt/managers/schedule_batch.py +240 -138
  231. sglang/srt/managers/schedule_policy.py +147 -19
  232. sglang/srt/managers/scheduler.py +501 -304
  233. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  234. sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
  235. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  236. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  237. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  238. sglang/srt/managers/template_manager.py +3 -3
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +321 -632
  241. sglang/srt/managers/tp_worker.py +81 -22
  242. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  243. sglang/srt/managers/utils.py +1 -45
  244. sglang/srt/mem_cache/allocator.py +15 -21
  245. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  246. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  247. sglang/srt/mem_cache/chunk_cache.py +8 -1
  248. sglang/srt/mem_cache/evict_policy.py +23 -0
  249. sglang/srt/mem_cache/hicache_storage.py +58 -34
  250. sglang/srt/mem_cache/hiradix_cache.py +227 -80
  251. sglang/srt/mem_cache/memory_pool.py +535 -58
  252. sglang/srt/mem_cache/memory_pool_host.py +239 -223
  253. sglang/srt/mem_cache/radix_cache.py +222 -73
  254. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  255. sglang/srt/mem_cache/storage/__init__.py +10 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  257. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  258. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  259. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  260. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  261. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  262. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  263. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
  264. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  265. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  266. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
  267. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  268. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  269. sglang/srt/metrics/collector.py +519 -132
  270. sglang/srt/metrics/func_timer.py +2 -7
  271. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  272. sglang/srt/metrics/utils.py +55 -0
  273. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  274. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  275. sglang/srt/model_executor/forward_batch_info.py +98 -57
  276. sglang/srt/model_executor/model_runner.py +433 -158
  277. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  278. sglang/srt/model_loader/__init__.py +9 -3
  279. sglang/srt/model_loader/loader.py +133 -5
  280. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  281. sglang/srt/model_loader/weight_utils.py +158 -3
  282. sglang/srt/models/apertus.py +686 -0
  283. sglang/srt/models/bailing_moe.py +820 -217
  284. sglang/srt/models/bailing_moe_nextn.py +168 -0
  285. sglang/srt/models/deepseek_nextn.py +6 -1
  286. sglang/srt/models/deepseek_v2.py +833 -152
  287. sglang/srt/models/dots_ocr.py +173 -0
  288. sglang/srt/models/dots_vlm.py +174 -0
  289. sglang/srt/models/dots_vlm_vit.py +337 -0
  290. sglang/srt/models/ernie4.py +1 -1
  291. sglang/srt/models/falcon_h1.py +576 -0
  292. sglang/srt/models/gemma3_causal.py +0 -2
  293. sglang/srt/models/gemma3_mm.py +1 -1
  294. sglang/srt/models/gemma3n_mm.py +2 -2
  295. sglang/srt/models/glm4_moe.py +14 -5
  296. sglang/srt/models/glm4_moe_nextn.py +2 -2
  297. sglang/srt/models/glm4v.py +5 -3
  298. sglang/srt/models/glm4v_moe.py +4 -1
  299. sglang/srt/models/gpt_oss.py +8 -31
  300. sglang/srt/models/internvl.py +28 -0
  301. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  302. sglang/srt/models/llama.py +4 -0
  303. sglang/srt/models/llama4.py +9 -0
  304. sglang/srt/models/llama_eagle3.py +13 -0
  305. sglang/srt/models/longcat_flash.py +3 -3
  306. sglang/srt/models/longcat_flash_nextn.py +1 -1
  307. sglang/srt/models/minicpmv.py +165 -3
  308. sglang/srt/models/mllama4.py +40 -4
  309. sglang/srt/models/opt.py +637 -0
  310. sglang/srt/models/qwen2_5_vl.py +29 -5
  311. sglang/srt/models/qwen2_audio.py +1 -1
  312. sglang/srt/models/qwen2_moe.py +124 -14
  313. sglang/srt/models/qwen2_vl.py +1 -1
  314. sglang/srt/models/qwen3.py +26 -5
  315. sglang/srt/models/qwen3_moe.py +71 -12
  316. sglang/srt/models/qwen3_next.py +1069 -0
  317. sglang/srt/models/qwen3_next_mtp.py +112 -0
  318. sglang/srt/models/qwen3_vl.py +787 -0
  319. sglang/srt/models/qwen3_vl_moe.py +471 -0
  320. sglang/srt/models/registry.py +15 -3
  321. sglang/srt/models/sarashina2_vision.py +269 -0
  322. sglang/srt/models/solar.py +505 -0
  323. sglang/srt/models/starcoder2.py +357 -0
  324. sglang/srt/models/step3_vl.py +1 -1
  325. sglang/srt/models/torch_native_llama.py +10 -3
  326. sglang/srt/models/utils.py +51 -0
  327. sglang/srt/multimodal/processors/base_processor.py +15 -7
  328. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  329. sglang/srt/multimodal/processors/glm4v.py +9 -9
  330. sglang/srt/multimodal/processors/internvl.py +153 -129
  331. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  332. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  333. sglang/srt/offloader.py +27 -3
  334. sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
  335. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  336. sglang/srt/sampling/sampling_batch_info.py +38 -17
  337. sglang/srt/sampling/sampling_params.py +7 -0
  338. sglang/srt/server_args.py +1030 -254
  339. sglang/srt/server_args_config_parser.py +146 -0
  340. sglang/srt/single_batch_overlap.py +151 -0
  341. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  342. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  343. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  344. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  345. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  346. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  347. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  348. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  349. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  350. sglang/srt/speculative/eagle_worker.py +253 -136
  351. sglang/srt/speculative/ngram_utils.py +428 -0
  352. sglang/srt/speculative/ngram_worker.py +245 -0
  353. sglang/srt/speculative/spec_info.py +52 -0
  354. sglang/srt/speculative/spec_utils.py +606 -0
  355. sglang/srt/speculative/standalone_worker.py +109 -0
  356. sglang/srt/torch_memory_saver_adapter.py +5 -7
  357. sglang/srt/tracing/trace.py +578 -0
  358. sglang/srt/two_batch_overlap.py +8 -5
  359. sglang/srt/utils/__init__.py +2 -0
  360. sglang/srt/{utils.py → utils/common.py} +445 -77
  361. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  362. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  363. sglang/srt/utils/rpd_utils.py +452 -0
  364. sglang/srt/utils/slow_rank_detector.py +71 -0
  365. sglang/srt/warmup.py +8 -4
  366. sglang/srt/weight_sync/utils.py +2 -2
  367. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  368. sglang/test/few_shot_gsm8k.py +1 -0
  369. sglang/test/get_logits_ut.py +57 -0
  370. sglang/test/run_eval.py +79 -11
  371. sglang/test/runners.py +5 -1
  372. sglang/test/simple_eval_common.py +5 -2
  373. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  374. sglang/test/test_block_fp8.py +2 -2
  375. sglang/test/test_cutlass_moe.py +24 -6
  376. sglang/test/test_deterministic.py +297 -0
  377. sglang/test/test_disaggregation_utils.py +77 -0
  378. sglang/test/test_fp4_moe.py +370 -1
  379. sglang/test/test_programs.py +1 -1
  380. sglang/test/test_utils.py +383 -5
  381. sglang/utils.py +22 -1
  382. sglang/version.py +1 -1
  383. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  384. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
  385. sglang/srt/disaggregation/launch_lb.py +0 -118
  386. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  387. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  388. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  389. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  390. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  391. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  392. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  393. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  394. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  395. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -5,14 +5,21 @@ import logging
5
5
  import os
6
6
  import signal
7
7
  import threading
8
+ import time
8
9
  from abc import ABC, abstractmethod
9
10
  from functools import wraps
10
11
  from typing import Any, List, Optional, Tuple
11
12
 
12
13
  import torch
13
14
 
14
- from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
15
- from sglang.srt.mem_cache.storage.hf3fs.client_hf3fs import Hf3fsClient
15
+ from sglang.srt.mem_cache.hicache_storage import (
16
+ HiCacheStorage,
17
+ HiCacheStorageConfig,
18
+ HiCacheStorageExtraInfo,
19
+ )
20
+ from sglang.srt.mem_cache.memory_pool_host import HostKVCache
21
+ from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsClient
22
+ from sglang.srt.metrics.collector import StorageMetrics
16
23
 
17
24
  logger = logging.getLogger(__name__)
18
25
 
@@ -112,6 +119,33 @@ def synchronized():
112
119
  return _decorator
113
120
 
114
121
 
122
+ def create_hf3fs_client(
123
+ path: str, size: int, bytes_per_page: int, entries: int, use_mock: bool = False
124
+ ) -> Hf3fsClient:
125
+ """Factory function to create appropriate HF3FS client.
126
+
127
+ Args:
128
+ path: File path for storage
129
+ size: Total size of storage file
130
+ bytes_per_page: Bytes per page
131
+ entries: Number of entries for batch operations
132
+ use_mock: Whether to use mock client instead of real usrbio client
133
+
134
+ Returns:
135
+ """
136
+ if use_mock:
137
+ from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsMockClient
138
+
139
+ logger.info(f"[Rank Using Hf3fsMockClient for testing")
140
+ return Hf3fsMockClient(path, size, bytes_per_page, entries)
141
+ else:
142
+ from sglang.srt.mem_cache.storage.hf3fs.hf3fs_usrbio_client import (
143
+ Hf3fsUsrBioClient,
144
+ )
145
+
146
+ return Hf3fsUsrBioClient(path, size, bytes_per_page, entries)
147
+
148
+
115
149
  class HiCacheHF3FS(HiCacheStorage):
116
150
  """HiCache backend that stores KV cache pages in HF3FS files."""
117
151
 
@@ -128,16 +162,20 @@ class HiCacheHF3FS(HiCacheStorage):
128
162
  dtype: torch.dtype,
129
163
  metadata_client: Hf3fsMetadataInterface,
130
164
  is_mla_model: bool = False,
165
+ is_page_first_layout: bool = False,
166
+ use_mock_client: bool = False,
131
167
  ):
132
168
  self.rank = rank
133
169
  self.file_path = file_path
134
170
  self.file_size = file_size
135
171
  self.numjobs = numjobs
136
172
  self.bytes_per_page = bytes_per_page
173
+ self.gb_per_page = bytes_per_page / (1 << 30)
137
174
  self.entries = entries
138
175
  self.dtype = dtype
139
176
  self.metadata_client = metadata_client
140
177
  self.is_mla_model = is_mla_model
178
+ self.is_page_first_layout = is_page_first_layout
141
179
  self.numel = self.bytes_per_page // self.dtype.itemsize
142
180
  self.num_pages = self.file_size // self.bytes_per_page
143
181
  self.skip_backup = False
@@ -145,17 +183,24 @@ class HiCacheHF3FS(HiCacheStorage):
145
183
  self.skip_backup = True
146
184
  self.rank = 0
147
185
 
186
+ self.is_zero_copy = False
187
+
148
188
  logger.info(
149
189
  f"[Rank {self.rank}] HiCacheHF3FS Client Initializing: "
150
190
  f"file_path={self.file_path}, "
151
191
  f"file_size={self.file_size / (2 ** 30):.2f} GB, "
152
- f"num_pages={self.num_pages}"
192
+ f"num_pages={self.num_pages}, "
193
+ f"is_mla_model={self.is_mla_model}"
153
194
  )
154
195
 
155
196
  self.ac = AtomicCounter(self.numjobs)
156
197
  self.clients = [
157
- Hf3fsClient(
158
- self.file_path, self.file_size, self.bytes_per_page, self.entries
198
+ create_hf3fs_client(
199
+ self.file_path,
200
+ self.file_size,
201
+ self.bytes_per_page,
202
+ self.entries,
203
+ use_mock_client,
159
204
  )
160
205
  for _ in range(numjobs)
161
206
  ]
@@ -172,6 +217,11 @@ class HiCacheHF3FS(HiCacheStorage):
172
217
  signal.signal(signal.SIGTERM, lambda sig, frame: self.close())
173
218
  signal.signal(signal.SIGQUIT, lambda sig, frame: self.close())
174
219
 
220
+ self.prefetch_pgs = []
221
+ self.backup_pgs = []
222
+ self.prefetch_bandwidth = []
223
+ self.backup_bandwidth = []
224
+
175
225
  @staticmethod
176
226
  def from_env_config(
177
227
  bytes_per_page: int,
@@ -192,10 +242,24 @@ class HiCacheHF3FS(HiCacheStorage):
192
242
  Hf3fsLocalMetadataClient,
193
243
  )
194
244
 
245
+ use_mock_client = False
195
246
  if storage_config is not None:
196
- rank, is_mla_model = storage_config.tp_rank, storage_config.is_mla_model
247
+ rank, is_mla_model, is_page_first_layout = (
248
+ storage_config.tp_rank,
249
+ storage_config.is_mla_model,
250
+ storage_config.is_page_first_layout,
251
+ )
252
+
253
+ if storage_config.extra_config is not None:
254
+ use_mock_client = storage_config.extra_config.get(
255
+ "use_mock_hf3fs_client", False
256
+ )
197
257
  else:
198
- rank, is_mla_model = 0, False
258
+ rank, is_mla_model, is_page_first_layout = (
259
+ 0,
260
+ False,
261
+ False,
262
+ )
199
263
 
200
264
  mla_unsupported_msg = f"MLA model is not supported without global metadata server, please refer to https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md"
201
265
 
@@ -213,6 +277,8 @@ class HiCacheHF3FS(HiCacheStorage):
213
277
  entries=8,
214
278
  dtype=dtype,
215
279
  metadata_client=Hf3fsLocalMetadataClient(),
280
+ is_page_first_layout=is_page_first_layout,
281
+ use_mock_client=use_mock_client,
216
282
  )
217
283
 
218
284
  try:
@@ -261,27 +327,16 @@ class HiCacheHF3FS(HiCacheStorage):
261
327
  dtype=dtype,
262
328
  metadata_client=metadata_client,
263
329
  is_mla_model=is_mla_model,
330
+ is_page_first_layout=is_page_first_layout,
331
+ use_mock_client=use_mock_client,
264
332
  )
265
333
 
266
- def get(
267
- self,
268
- key: str,
269
- target_location: Optional[Any] = None,
270
- target_sizes: Optional[Any] = None,
271
- ) -> torch.Tensor | None:
272
- return self.batch_get(
273
- [key],
274
- [target_location] if target_location is not None else None,
275
- [target_sizes] if target_sizes is not None else None,
276
- )[0]
277
-
278
334
  @synchronized()
279
- def batch_get(
335
+ def _batch_get(
280
336
  self,
281
337
  keys: List[str],
282
- target_locations: Optional[Any] = None,
283
- target_sizes: Optional[Any] = None,
284
- ) -> List[torch.Tensor | None]:
338
+ values: List[torch.Tensor],
339
+ ) -> List[bool]:
285
340
  page_indices = self.metadata_client.get_page_indices(self.rank, keys)
286
341
 
287
342
  batch_indices, file_offsets = [], []
@@ -290,15 +345,11 @@ class HiCacheHF3FS(HiCacheStorage):
290
345
  batch_indices.append(i)
291
346
  file_offsets.append(page_index * self.bytes_per_page)
292
347
 
293
- if target_locations is not None:
294
- for target_location in target_locations:
295
- assert target_location.is_contiguous()
296
- file_results = target_locations
297
- else:
298
- file_results = [
299
- torch.empty(self.numel, dtype=self.dtype)
300
- for _ in range(len(batch_indices))
301
- ]
348
+ for target_location in values:
349
+ assert target_location.is_contiguous()
350
+ file_results = values
351
+
352
+ start_time = time.perf_counter()
302
353
 
303
354
  futures = [
304
355
  self.executor.submit(
@@ -310,12 +361,17 @@ class HiCacheHF3FS(HiCacheStorage):
310
361
  ]
311
362
  read_results = [result for future in futures for result in future.result()]
312
363
 
313
- results = [None] * len(keys)
314
- for batch_index, file_result, read_result in zip(
315
- batch_indices, file_results, read_results
316
- ):
364
+ end_time = time.perf_counter()
365
+ ionum = len(batch_indices)
366
+ self.prefetch_pgs.append(ionum)
367
+ self.prefetch_bandwidth.append(
368
+ ionum / (end_time - start_time) * self.gb_per_page
369
+ )
370
+
371
+ results = [False] * len(keys)
372
+ for batch_index, read_result in zip(batch_indices, read_results):
317
373
  if read_result == self.bytes_per_page:
318
- results[batch_index] = file_result
374
+ results[batch_index] = True
319
375
  else:
320
376
  logger.error(
321
377
  f"[Rank {self.rank}] HiCacheHF3FS get {keys[batch_index]} failed"
@@ -323,27 +379,12 @@ class HiCacheHF3FS(HiCacheStorage):
323
379
 
324
380
  return results
325
381
 
326
- def set(
327
- self,
328
- key: str,
329
- value: Optional[Any] = None,
330
- target_location: Optional[Any] = None,
331
- target_sizes: Optional[Any] = None,
332
- ) -> bool:
333
- return self.batch_set(
334
- [key],
335
- [value] if value is not None else None,
336
- [target_location] if target_location is not None else None,
337
- [target_sizes] if target_sizes is not None else None,
338
- )
339
-
340
- def batch_set(
382
+ @synchronized()
383
+ def _batch_set(
341
384
  self,
342
385
  keys: List[str],
343
386
  values: Optional[Any] = None,
344
- target_locations: Optional[Any] = None,
345
- target_sizes: Optional[Any] = None,
346
- ) -> bool:
387
+ ) -> List[bool]:
347
388
  # In MLA backend, only one rank needs to backup the KV cache
348
389
  if self.skip_backup:
349
390
  return True
@@ -366,6 +407,8 @@ class HiCacheHF3FS(HiCacheStorage):
366
407
  assert value.is_contiguous()
367
408
  file_values.append(value)
368
409
 
410
+ start_time = time.perf_counter()
411
+
369
412
  futures = [
370
413
  self.executor.submit(
371
414
  self.clients[self.ac.next()].batch_write,
@@ -380,6 +423,11 @@ class HiCacheHF3FS(HiCacheStorage):
380
423
  for result in future.result()
381
424
  ]
382
425
 
426
+ end_time = time.perf_counter()
427
+ ionum = len(batch_indices)
428
+ self.backup_pgs.append(ionum)
429
+ self.backup_bandwidth.append(ionum / (end_time - start_time) * self.gb_per_page)
430
+
383
431
  written_keys_to_confirm = []
384
432
  results = [index[0] for index in indices]
385
433
  for batch_index, write_result in zip(batch_indices, write_results):
@@ -397,7 +445,7 @@ class HiCacheHF3FS(HiCacheStorage):
397
445
  self.rank, written_keys_to_confirm, pages_to_release
398
446
  )
399
447
 
400
- return all(results)
448
+ return results
401
449
 
402
450
  def delete(self, key: str) -> None:
403
451
  self.metadata_client.delete_keys(self.rank, [key])
@@ -407,21 +455,25 @@ class HiCacheHF3FS(HiCacheStorage):
407
455
  return result[0] if result else False
408
456
 
409
457
  def batch_exists(self, keys: List[str]) -> int:
458
+ factor = 1
459
+ if self.is_zero_copy and not self.is_mla_model:
460
+ keys = self._get_mha_zero_copy_keys(keys)
461
+ factor = 2
462
+
410
463
  results = self.metadata_client.exists(self.rank, keys)
411
- for i in range(len(keys)):
412
- if not results[i]:
413
- return i
414
464
 
415
- return len(keys)
465
+ i = 0
466
+ while i < len(keys) and results[i]:
467
+ i += 1
416
468
 
417
- def clear(self) -> bool:
469
+ return i // factor
470
+
471
+ def clear(self) -> None:
418
472
  try:
419
473
  self.metadata_client.clear(self.rank)
420
474
  logger.info(f"Cleared HiCacheHF3FS for rank {self.rank}")
421
- return True
422
475
  except Exception as e:
423
476
  logger.error(f"Failed to clear HiCacheHF3FS: {e}")
424
- return False
425
477
 
426
478
  def close(self) -> None:
427
479
  try:
@@ -431,3 +483,156 @@ class HiCacheHF3FS(HiCacheStorage):
431
483
  except Exception as e:
432
484
  logger.error(f"close HiCacheHF3FS: {e}")
433
485
  logger.info("close HiCacheHF3FS")
486
+
487
+ @synchronized()
488
+ def get_stats(self):
489
+ storage_metrics = StorageMetrics()
490
+ storage_metrics.prefetch_pgs.extend(self.prefetch_pgs)
491
+ storage_metrics.backup_pgs.extend(self.backup_pgs)
492
+ storage_metrics.prefetch_bandwidth.extend(self.prefetch_bandwidth)
493
+ storage_metrics.backup_bandwidth.extend(self.backup_bandwidth)
494
+ self.prefetch_pgs.clear()
495
+ self.backup_pgs.clear()
496
+ self.prefetch_bandwidth.clear()
497
+ self.backup_bandwidth.clear()
498
+ return storage_metrics
499
+
500
+ def register_mem_pool_host(self, mem_pool_host: HostKVCache):
501
+ super().register_mem_pool_host(mem_pool_host)
502
+ self.is_zero_copy = self.mem_pool_host.layout == "page_first"
503
+ logger.info(f"{self.is_zero_copy=}")
504
+
505
+ def _get_mha_zero_copy_keys(self, keys: List[str]) -> List[str]:
506
+ _keys = []
507
+ for k in keys:
508
+ _keys.append(f"{k}-k")
509
+ _keys.append(f"{k}-v")
510
+ return _keys
511
+
512
+ def _get_mha_zero_copy_values(
513
+ self, values: List[torch.Tensor]
514
+ ) -> List[torch.Tensor]:
515
+ _values = []
516
+ for value in values:
517
+ _values.append(value[0])
518
+ _values.append(value[1])
519
+ return _values
520
+
521
+ def _batch_get_preprocess(self, keys, host_indices):
522
+ page_num = len(host_indices) // self.mem_pool_host.page_size
523
+ # host_indices to kv_buffer
524
+ flat = not self.is_zero_copy
525
+ values = (
526
+ [
527
+ self.mem_pool_host.get_data_page(
528
+ host_indices[i * self.mem_pool_host.page_size], flat=flat
529
+ )
530
+ for i in range(page_num)
531
+ ]
532
+ if self.is_zero_copy
533
+ else [
534
+ self.mem_pool_host.get_dummy_flat_data_page() for _ in range(page_num)
535
+ ]
536
+ )
537
+
538
+ if self.is_zero_copy and not self.is_mla_model:
539
+ keys = self._get_mha_zero_copy_keys(keys)
540
+ values = self._get_mha_zero_copy_values(values)
541
+
542
+ return keys, values
543
+
544
+ def _batch_get_postprocess(self, host_indices, values, results):
545
+ page_num = len(host_indices) // self.mem_pool_host.page_size
546
+
547
+ if self.is_zero_copy:
548
+ if not self.is_mla_model:
549
+ results = [
550
+ (results[2 * i] and results[2 * i + 1]) for i in range(page_num)
551
+ ]
552
+ results = results[:page_num]
553
+ return results
554
+
555
+ for i in range(page_num):
556
+ if not results[i]:
557
+ break
558
+ self.mem_pool_host.set_from_flat_data_page(
559
+ host_indices[i * self.mem_pool_host.page_size], values[i]
560
+ )
561
+
562
+ return results
563
+
564
+ def batch_get_v1(
565
+ self,
566
+ keys: List[str],
567
+ host_indices: torch.Tensor,
568
+ extra_info: Optional[HiCacheStorageExtraInfo] = None,
569
+ ) -> List[bool]:
570
+ keys, values = self._batch_get_preprocess(keys, host_indices)
571
+ results = self._batch_get(keys, values)
572
+ return self._batch_get_postprocess(host_indices, values, results)
573
+
574
+ def _batch_set_preprocess(self, keys, host_indices):
575
+ page_num = len(host_indices) // self.mem_pool_host.page_size
576
+ # host_indices to kv_buffer
577
+ flat = not self.is_zero_copy
578
+ values = [
579
+ self.mem_pool_host.get_data_page(
580
+ host_indices[i * self.mem_pool_host.page_size], flat=flat
581
+ )
582
+ for i in range(page_num)
583
+ ]
584
+
585
+ if self.is_zero_copy and not self.is_mla_model:
586
+ keys = self._get_mha_zero_copy_keys(keys)
587
+ values = self._get_mha_zero_copy_values(values)
588
+
589
+ return keys, values
590
+
591
+ def batch_set_v1(
592
+ self,
593
+ keys: List[str],
594
+ host_indices: torch.Tensor,
595
+ extra_info: Optional[HiCacheStorageExtraInfo] = None,
596
+ ) -> List[bool]:
597
+ len_keys = len(keys)
598
+ keys, values = self._batch_set_preprocess(keys, host_indices)
599
+ results = self._batch_set(keys, values)
600
+ return results
601
+
602
+ # Deprecated
603
+ def get(
604
+ self,
605
+ key: str,
606
+ target_location: Optional[Any] = None,
607
+ target_sizes: Optional[Any] = None,
608
+ ) -> torch.Tensor | None:
609
+ pass
610
+
611
+ # Deprecated
612
+ def batch_get(
613
+ self,
614
+ keys: List[str],
615
+ target_locations: Optional[Any] = None,
616
+ target_sizes: Optional[Any] = None,
617
+ ) -> List[torch.Tensor | None] | int:
618
+ pass
619
+
620
+ # Deprecated
621
+ def set(
622
+ self,
623
+ key: str,
624
+ value: Optional[Any] = None,
625
+ target_location: Optional[Any] = None,
626
+ target_sizes: Optional[Any] = None,
627
+ ) -> bool:
628
+ pass
629
+
630
+ # Deprecated
631
+ def batch_set(
632
+ self,
633
+ keys: List[str],
634
+ values: Optional[Any] = None,
635
+ target_locations: Optional[Any] = None,
636
+ target_sizes: Optional[Any] = None,
637
+ ) -> bool:
638
+ pass