PyPI - sglang - Versions diffs - 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl - Mend

sglang 0.5.3rc0py3-none-any.whl → 0.5.3rc2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282) hide show

sglang/bench_one_batch.py +7 -9
sglang/bench_one_batch_server.py +321 -31
sglang/bench_serving.py +10 -3
sglang/global_config.py +2 -2
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/launch_server.py +14 -0
sglang/profiler.py +2 -2
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/falcon_h1.py +360 -0
sglang/srt/configs/load_config.py +8 -0
sglang/srt/configs/model_config.py +160 -105
sglang/srt/configs/qwen3_vl.py +586 -0
sglang/srt/constrained/base_grammar_backend.py +1 -0
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/xgrammar_backend.py +6 -4
sglang/srt/debug_utils/dumper.py +10 -3
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
sglang/srt/disaggregation/common/conn.py +266 -98
sglang/srt/disaggregation/decode.py +50 -9
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
sglang/srt/disaggregation/mooncake/conn.py +51 -541
sglang/srt/disaggregation/nixl/conn.py +148 -39
sglang/srt/disaggregation/prefill.py +31 -14
sglang/srt/disaggregation/utils.py +36 -5
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/parallel_state.py +135 -80
sglang/srt/entrypoints/engine.py +23 -3
sglang/srt/entrypoints/grpc_request_manager.py +330 -55
sglang/srt/entrypoints/grpc_server.py +232 -102
sglang/srt/entrypoints/http_server.py +49 -9
sglang/srt/entrypoints/openai/protocol.py +110 -5
sglang/srt/entrypoints/openai/serving_base.py +25 -6
sglang/srt/entrypoints/openai/serving_chat.py +178 -49
sglang/srt/entrypoints/openai/serving_completions.py +5 -3
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +42 -0
sglang/srt/environ.py +285 -0
sglang/srt/eplb/expert_location.py +30 -5
sglang/srt/function_call/function_call_parser.py +3 -2
sglang/srt/function_call/glm4_moe_detector.py +3 -3
sglang/srt/function_call/gpt_oss_detector.py +23 -0
sglang/srt/function_call/json_array_parser.py +63 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/utils.py +96 -5
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
sglang/srt/layers/activation.py +7 -6
sglang/srt/layers/attention/aiter_backend.py +14 -15
sglang/srt/layers/attention/ascend_backend.py +108 -9
sglang/srt/layers/attention/attention_registry.py +206 -0
sglang/srt/layers/attention/base_attn_backend.py +12 -3
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
sglang/srt/layers/attention/flashattention_backend.py +41 -8
sglang/srt/layers/attention/flashinfer_backend.py +112 -194
sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
sglang/srt/layers/attention/flashmla_backend.py +7 -5
sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
sglang/srt/layers/attention/mamba/mamba.py +566 -1
sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/utils.py +24 -0
sglang/srt/layers/attention/nsa_backend.py +887 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/triton_backend.py +42 -9
sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
sglang/srt/layers/attention/vision.py +58 -0
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/communicator.py +8 -0
sglang/srt/layers/dp_attention.py +11 -1
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +2 -0
sglang/srt/layers/linear.py +21 -4
sglang/srt/layers/logits_processor.py +15 -2
sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
sglang/srt/layers/moe/ep_moe/layer.py +147 -74
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
sglang/srt/layers/moe/utils.py +10 -0
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
sglang/srt/layers/quantization/fp8.py +2 -2
sglang/srt/layers/quantization/fp8_utils.py +1 -1
sglang/srt/layers/quantization/modelopt_quant.py +44 -9
sglang/srt/layers/quantization/mxfp4.py +12 -4
sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
sglang/srt/layers/quantization/w4afp8.py +0 -4
sglang/srt/layers/quantization/w8a8_int8.py +15 -3
sglang/srt/layers/rotary_embedding.py +78 -31
sglang/srt/layers/sampler.py +52 -4
sglang/srt/layers/utils.py +23 -0
sglang/srt/lora/backend/base_backend.py +3 -3
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +10 -4
sglang/srt/lora/lora.py +7 -5
sglang/srt/lora/lora_manager.py +17 -6
sglang/srt/lora/mem_pool.py +1 -1
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
sglang/srt/lora/utils.py +7 -5
sglang/srt/managers/cache_controller.py +42 -142
sglang/srt/managers/data_parallel_controller.py +11 -46
sglang/srt/managers/detokenizer_manager.py +11 -11
sglang/srt/managers/io_struct.py +162 -118
sglang/srt/managers/mm_utils.py +43 -6
sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +53 -0
sglang/srt/managers/schedule_batch.py +167 -86
sglang/srt/managers/schedule_policy.py +143 -16
sglang/srt/managers/scheduler.py +359 -214
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
sglang/srt/managers/tokenizer_manager.py +84 -136
sglang/srt/managers/tp_worker.py +39 -29
sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
sglang/srt/managers/utils.py +1 -45
sglang/srt/mem_cache/allocator.py +14 -20
sglang/srt/mem_cache/allocator_ascend.py +41 -27
sglang/srt/mem_cache/base_prefix_cache.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +8 -1
sglang/srt/mem_cache/evict_policy.py +23 -0
sglang/srt/mem_cache/hicache_storage.py +40 -1
sglang/srt/mem_cache/hiradix_cache.py +119 -32
sglang/srt/mem_cache/memory_pool.py +188 -10
sglang/srt/mem_cache/memory_pool_host.py +134 -182
sglang/srt/mem_cache/radix_cache.py +222 -71
sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
sglang/srt/mem_cache/swa_radix_cache.py +25 -34
sglang/srt/metrics/collector.py +82 -120
sglang/srt/metrics/func_timer.py +2 -7
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +2 -2
sglang/srt/model_executor/cuda_graph_runner.py +39 -32
sglang/srt/model_executor/forward_batch_info.py +23 -38
sglang/srt/model_executor/model_runner.py +131 -183
sglang/srt/model_executor/npu_graph_runner.py +12 -5
sglang/srt/model_loader/loader.py +14 -10
sglang/srt/model_loader/weight_utils.py +156 -2
sglang/srt/models/bailing_moe.py +27 -4
sglang/srt/models/deepseek_nextn.py +6 -1
sglang/srt/models/deepseek_v2.py +536 -153
sglang/srt/models/dots_ocr.py +173 -0
sglang/srt/models/falcon_h1.py +576 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +1 -1
sglang/srt/models/gemma3n_mm.py +1 -1
sglang/srt/models/glm4_moe.py +3 -3
sglang/srt/models/glm4_moe_nextn.py +2 -2
sglang/srt/models/glm4v.py +1 -1
sglang/srt/models/glm4v_moe.py +1 -1
sglang/srt/models/gpt_oss.py +7 -30
sglang/srt/models/kimi_vl_moonvit.py +2 -2
sglang/srt/models/llama.py +4 -0
sglang/srt/models/longcat_flash.py +1 -1
sglang/srt/models/longcat_flash_nextn.py +1 -1
sglang/srt/models/mllama4.py +15 -4
sglang/srt/models/qwen2.py +0 -7
sglang/srt/models/qwen2_5_vl.py +2 -2
sglang/srt/models/qwen2_audio.py +1 -1
sglang/srt/models/qwen2_moe.py +64 -1
sglang/srt/models/qwen2_vl.py +1 -1
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +31 -3
sglang/srt/models/qwen3_next.py +36 -9
sglang/srt/models/qwen3_vl.py +787 -0
sglang/srt/models/qwen3_vl_moe.py +471 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/sarashina2_vision.py +269 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +51 -0
sglang/srt/multimodal/processors/base_processor.py +15 -7
sglang/srt/multimodal/processors/dots_vlm.py +2 -3
sglang/srt/multimodal/processors/internvl.py +20 -8
sglang/srt/multimodal/processors/qwen_vl.py +8 -1
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/sampling/sampling_batch_info.py +20 -2
sglang/srt/sampling/sampling_params.py +7 -0
sglang/srt/server_args.py +753 -295
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +151 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
sglang/srt/speculative/eagle_worker.py +57 -25
sglang/srt/speculative/ngram_utils.py +428 -0
sglang/srt/speculative/ngram_worker.py +245 -0
sglang/srt/speculative/spec_info.py +47 -0
sglang/srt/speculative/spec_utils.py +606 -0
sglang/srt/torch_memory_saver_adapter.py +5 -7
sglang/srt/tracing/trace.py +32 -6
sglang/srt/two_batch_overlap.py +8 -5
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{utils.py → utils/common.py} +399 -74
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/get_logits_ut.py +57 -0
sglang/test/run_eval.py +79 -11
sglang/test/runners.py +1 -1
sglang/test/simple_eval_common.py +5 -2
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +2 -2
sglang/test/test_deterministic.py +297 -0
sglang/test/test_disaggregation_utils.py +12 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +355 -4
sglang/utils.py +10 -1
sglang/version.py +1 -1
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
/sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
/sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0

sglang/srt/managers/cache_controller.py CHANGED Viewed

@@ -250,7 +250,7 @@ class HiCacheController:
         storage_backend: Optional[str] = None,
         prefetch_threshold: int = 256,
         model_name: Optional[str] = None,
-        storage_backend_extra_config: Optional[str] = None,
+        storage_backend_extra_config: Optional[dict] = None,
     ):
         self.mem_pool_device_allocator = token_to_kv_pool_allocator
         self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
@@ -275,43 +275,17 @@ class HiCacheController:
                 and self.storage_config.tp_rank != 0
             )
-            if storage_backend == "file":
-                from sglang.srt.mem_cache.hicache_storage import HiCacheFile
+            # Use storage backend factory for dynamic backend creation
+            from sglang.srt.mem_cache.storage import StorageBackendFactory
-                self.storage_backend = HiCacheFile(self.storage_config)
-            elif storage_backend == "nixl":
-                from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
-                self.storage_backend = HiCacheNixl()
-            elif storage_backend == "mooncake":
-                from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
-                    MooncakeStore,
+            try:
+                self.storage_backend = StorageBackendFactory.create_backend(
+                    storage_backend, self.storage_config, self.mem_pool_host
                 )
+            except ValueError as e:
+                raise ValueError(f"Failed to create storage backend: {e}") from e
-                self.storage_backend = MooncakeStore(self.storage_config)
-                self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
-                assert self.mem_pool_host.layout == "page_first"
-            elif storage_backend == "hf3fs":
-                from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import (
-                    HiCacheHF3FS,
-                )
-                if self.mem_pool_host.layout == "page_first":
-                    bytes_per_page = (
-                        mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
-                    )
-                elif self.mem_pool_host.layout == "layer_first":
-                    bytes_per_page = (
-                        mem_pool_host.get_size_per_token() * mem_pool_host.page_size
-                    )
-                dtype = mem_pool_host.dtype
-                self.storage_backend = HiCacheHF3FS.from_env_config(
-                    bytes_per_page, dtype, self.storage_config
-                )
-            else:
-                raise NotImplementedError(
-                    f"Unsupported storage backend: {storage_backend}"
-                )
+            self.storage_backend.register_mem_pool_host(self.mem_pool_host)
             self.enable_storage = True
             # todo: threshold policy for prefetching
@@ -335,18 +309,10 @@ class HiCacheController:
             # Select the get and set functions
             self.page_get_func = self._generic_page_get
             self.page_set_func = self._generic_page_set
-            self.batch_exists_func = self.storage_backend.batch_exists
-            self.is_3fs_zerocopy = (
-                self.storage_backend_type == "hf3fs"
-                and self.mem_pool_host.layout == "page_first"
-            )
-            if self.storage_backend_type == "mooncake":
-                self.page_get_func = self._mooncake_page_get
-                self.page_set_func = self._mooncake_page_set
-            elif self.is_3fs_zerocopy:
-                self.page_get_func = self._3fs_zero_copy_page_get
-                self.page_set_func = self._3fs_zero_copy_page_set
-                self.batch_exists_func = self._3fs_zero_copy_batch_exists
+            if self.storage_backend_type in ["hf3fs", "mooncake", "eic"]:
+                self.page_get_func = self._page_get_zero_copy
+                self.page_set_func = self._page_set_zero_copy
         self.device = self.mem_pool_device.device
         self.layer_num = self.mem_pool_device.layer_num
@@ -395,7 +361,7 @@ class HiCacheController:
     def _generate_storage_config(
         self,
         model_name: Optional[str] = None,
-        storage_backend_extra_config: Optional[str] = None,
+        storage_backend_extra_config: Optional[dict] = None,
     ):
         if is_dp_attention_enabled():
@@ -410,23 +376,13 @@ class HiCacheController:
         # Currently, AscendMLAPagedTokenToKVPool is the subclass of MLATokenToKVPool.
         is_mla_backend = isinstance(self.mem_pool_device, MLATokenToKVPool)
-        # Parse extra config JSON if provided
-        extra_config = None
-        if storage_backend_extra_config:
-            try:
-                import json
-                extra_config = json.loads(storage_backend_extra_config)
-            except Exception as e:
-                logger.error(f"Invalid backend extra config JSON: {e}")
         return HiCacheStorageConfig(
             tp_rank=self.tp_rank,
             tp_size=self.tp_size,
             is_mla_model=is_mla_backend,
             is_page_first_layout=self.mem_pool_host.layout == "page_first",
             model_name=model_name,
-            extra_config=extra_config,
+            extra_config=storage_backend_extra_config,
         )
     def reset(self):
@@ -470,7 +426,6 @@ class HiCacheController:
         host_indices = self.mem_pool_host.alloc(len(device_indices))
         if host_indices is None:
             return None
-        self.mem_pool_host.protect_write(host_indices)
         self.write_queue.append(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
@@ -494,7 +449,6 @@ class HiCacheController:
             self.mem_pool_host.backup_from_device_all_layer(
                 self.mem_pool_device, host_indices, device_indices, self.io_backend
             )
-            self.mem_pool_host.complete_io(op.host_indices)
             finish_event.record()
             # NOTE: We must save the host indices and device indices here,
             # this is because we need to guarantee that these tensors are
@@ -518,7 +472,6 @@ class HiCacheController:
         device_indices = self.mem_pool_device_allocator.alloc(len(host_indices))
         if device_indices is None:
             return None
-        self.mem_pool_host.protect_load(host_indices)
         self.load_queue.append(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
@@ -563,7 +516,6 @@ class HiCacheController:
                     self.io_backend,
                 )
                 producer_event.complete(i)
-            self.mem_pool_host.complete_io(op.host_indices)
             # NOTE: We must save the host indices and device indices here,
             # this is because we need to guarantee that these tensors are
             # still alive when the load stream is executing.
@@ -581,29 +533,16 @@ class HiCacheController:
         )
         return producer_id
-    def evict_device(
-        self, device_indices: torch.Tensor, host_indices: torch.Tensor
-    ) -> int:
-        if self.mem_pool_host.is_synced(host_indices):
-            self.mem_pool_device_allocator.free(device_indices)
-            self.mem_pool_host.update_backup(host_indices)
-            return len(device_indices)
-        else:
-            raise ValueError(
-                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
-            )
+    def evict_device(self, device_indices: torch.Tensor) -> int:
+        self.mem_pool_device_allocator.free(device_indices)
+        return len(device_indices)
     def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int:
         if not backup_only:
             raise ValueError("Other eviction policies are not supported yet.")
-        if self.mem_pool_host.is_backup(host_indices):
-            self.mem_pool_host.free(host_indices)
-            return len(host_indices)
-        else:
-            raise ValueError(
-                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
-            )
+        self.mem_pool_host.free(host_indices)
+        return len(host_indices)
     def prefetch(
         self,
@@ -626,46 +565,25 @@ class HiCacheController:
         return operation.completed_tokens, operation.hash_value
     def append_host_mem_release(self, host_indices: torch.Tensor):
-        chunks = host_indices.split(self.mem_pool_host.page_size)
-        for chunk in chunks:
-            self.host_mem_release_queue.put(chunk)
-    def _3fs_zero_copy_batch_exists(self, batch_hashes):
-        _batch_hashes, _, factor = self.mem_pool_host.get_buffer_with_hash(batch_hashes)
-        hit_page_num = self.storage_backend.batch_exists(_batch_hashes) // factor
-        return hit_page_num
-    def _3fs_zero_copy_page_get(self, operation, hash_values, host_indices):
-        hashes, dsts, factor = self.mem_pool_host.get_buffer_with_hash(
-            hash_values, host_indices
-        )
-        page_data = self.storage_backend.batch_get(hashes, dsts)
-        if page_data:
-            inc = self.page_size * len(hashes) // factor
-            operation.increment(inc)
-        else:
-            logger.warning(
-                f"Prefetch operation {operation.request_id} failed to retrieve page {hashes}."
-            )
+        if host_indices.numel() == 0:
+            return
+        pages = host_indices.split(self.mem_pool_host.page_size)
+        for page in pages:
+            self.host_mem_release_queue.put(page)
-    def _mooncake_page_get(self, operation, hash_values, host_indices):
-        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
-            hash_values,
-            host_indices,
-            self.storage_config.tp_rank,
-        )
-        get_result = self.storage_backend.batch_get(
-            key_strs,
-            target_locations=buffer_ptrs,
-            target_sizes=buffer_sizes,
-        )
-        if get_result != len(hash_values):
-            logger.warning(
-                f"Prefetch operation {operation.request_id} failed or partially failed."
-            )
-        if get_result != 0:
-            operation.increment(get_result * self.page_size)
+    def _page_get_zero_copy(self, operation, hash_values, host_indices):
+        results = self.storage_backend.batch_get_v1(hash_values, host_indices)
+        inc = 0
+        for i in range(len(hash_values)):
+            if not results[i]:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {hash_values[i]}."
+                )
+                break
+            inc += self.page_size
+        operation.increment(inc)
+    # todo: deprecate
     def _generic_page_get(self, operation, hash_values, host_indices):
         dummy_page_dst = [
             self.mem_pool_host.get_dummy_flat_data_page() for _ in hash_values
@@ -755,7 +673,7 @@ class HiCacheController:
                     batch_tokens[i : i + self.page_size], last_hash
                 )
                 batch_hashes.append(last_hash)
-            hit_page_num = self.batch_exists_func(batch_hashes)
+            hit_page_num = self.storage_backend.batch_exists(batch_hashes)
             hash_value.extend(batch_hashes[:hit_page_num])
             storage_query_count += hit_page_num * self.page_size
             if hit_page_num < len(batch_hashes):
@@ -824,34 +742,16 @@ class HiCacheController:
         self.backup_queue.put(operation)
         return operation.id
-    # non-zero copy
+    # todo: deprecate
     def _generic_page_set(self, hash_values, host_indices) -> bool:
         data = [
-            self.mem_pool_host.get_flat_data_page(host_indices[i * self.page_size])
+            self.mem_pool_host.get_data_page(host_indices[i * self.page_size])
             for i in range(len(hash_values))
         ]
         return self.storage_backend.batch_set(hash_values, data)
-    # zero copy
-    def _mooncake_page_set(self, hash_values, host_indices) -> bool:
-        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
-            hash_values,
-            host_indices,
-            self.storage_config.tp_rank,
-        )
-        success = self.storage_backend.batch_set(
-            key_strs,
-            target_locations=buffer_ptrs,
-            target_sizes=buffer_sizes,
-        )
-        return success
-    # zero copy
-    def _3fs_zero_copy_page_set(self, hash_values, host_indices) -> bool:
-        hashes, dsts, _ = self.mem_pool_host.get_buffer_with_hash(
-            hash_values, host_indices
-        )
-        return self.storage_backend.batch_set(hashes, dsts)
+    def _page_set_zero_copy(self, hash_values, host_indices) -> bool:
+        return all(self.storage_backend.batch_set_v1(hash_values, host_indices))
     # Backup batch by batch
     def _page_backup(self, operation):

sglang/srt/managers/data_parallel_controller.py CHANGED Viewed

@@ -17,14 +17,11 @@ import faulthandler
 import logging
 import multiprocessing as mp
 import signal
-import struct
-import sys
 import threading
 import time
 from collections import deque
 from enum import Enum, auto
-from multiprocessing import shared_memory
-from typing import Dict, List
+from typing import List
 import psutil
 import setproctitle
@@ -39,7 +36,6 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.schedule_batch import Req
 from sglang.srt.managers.scheduler import run_scheduler_process
-from sglang.srt.managers.utils import DPBalanceMeta
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
@@ -108,15 +104,9 @@ class DPBudget:
 class DataParallelController:
     """A controller that dispatches requests to multiple data parallel workers."""
-    def __init__(
-        self,
-        server_args: ServerArgs,
-        port_args: PortArgs,
-        dp_balance_meta: DPBalanceMeta,
-    ) -> None:
+    def __init__(self, server_args: ServerArgs, port_args: PortArgs) -> None:
         # for dp balance
         self.global_balance_id = 0
-        self.balance_meta = dp_balance_meta
         # Parse args
         self.max_total_num_tokens = None
@@ -219,7 +209,9 @@ class DataParallelController:
                 args=(server_args, tmp_port_args, base_gpu_id, dp_rank, ready_event),
             )
             threads.append(thread)
-            base_gpu_id += server_args.tp_size * server_args.gpu_id_step
+            base_gpu_id += (
+                server_args.tp_size * server_args.pp_size * server_args.gpu_id_step
+            )
         # Free all sockets before starting the threads to launch TP workers
         for sock in sockets:
@@ -322,7 +314,6 @@ class DataParallelController:
                         pp_rank,
                         dp_rank,
                         writer,
-                        self.balance_meta,
                     ),
                 )
                 with memory_saver_adapter.configure_subprocess():
@@ -370,31 +361,11 @@ class DataParallelController:
         if self.maybe_external_dp_rank_routing(req):
             return
-        # This variable corresponds to the balance_id in TokenizedGenerateReqInput.
-        # We use it to to control the number of onfly tokens (requests dispatched to workers but not yet received).
-        def get_next_global_balance_id() -> int:
-            INT32_MAX = 2147483647
-            current_id = self.global_balance_id
-            self.global_balance_id = (self.global_balance_id + 1) % INT32_MAX
-            return current_id
-        req.dp_balance_id = get_next_global_balance_id()
-        with self.balance_meta.mutex:
-            # 1. local_tokens represents the tokens currently inferring on the worker,
-            #  while onfly refers to the requests dispatched by the dispatcher but not yet received by the scheduler.
-            onfly_info = self.balance_meta.get_shared_onfly()
-            local_tokens = self.balance_meta.get_shared_local_tokens()
-            total_tokens = [
-                local_token + sum(onfly_dict.values())
-                for local_token, onfly_dict in zip(local_tokens, onfly_info)
-            ]
-            target_worker = total_tokens.index(min(total_tokens))
-            onfly_info[target_worker][req.dp_balance_id] = len(req.input_ids)
-            # 2. write the new onfly info to the shm
-            self.balance_meta.set_shared_onfly_info(onfly_info)
-        # logger.info(f"dp workers {local_tokens=}, {onfly_info=}, {target_worker=}")
-        self.workers[target_worker].send_pyobj(req)
+        logger.warning(
+            "The 'minimum_tokens' load balancing method is deprecated for now and will introduced later."
+            "Fall back to 'round_robin_scheduler'"
+        )
+        self.round_robin_scheduler(req)
     def event_loop(self):
         while True:
@@ -416,12 +387,9 @@ def run_data_parallel_controller_process(
     faulthandler.enable()
     configure_logger(server_args)
     parent_process = psutil.Process().parent()
-    balance_meta = DPBalanceMeta(server_args.dp_size)
     try:
-        controller = DataParallelController(
-            server_args, port_args, dp_balance_meta=balance_meta
-        )
+        controller = DataParallelController(server_args, port_args)
         pipe_writer.send(
             {
                 "status": "ready",
@@ -440,6 +408,3 @@ def run_data_parallel_controller_process(
         traceback = get_exception_traceback()
         logger.error(f"DataParallelController hit an exception: {traceback}")
         parent_process.send_signal(signal.SIGQUIT)
-    finally:
-        # we need to destruct mp.Manager() in balance_meta
-        balance_meta.destructor()

sglang/srt/managers/detokenizer_manager.py CHANGED Viewed

@@ -24,13 +24,12 @@ import psutil
 import setproctitle
 import zmq
-from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import (
-    BatchEmbeddingOut,
+    BatchEmbeddingOutput,
     BatchMultimodalDecodeReq,
-    BatchMultimodalOut,
-    BatchStrOut,
-    BatchTokenIDOut,
+    BatchMultimodalOutput,
+    BatchStrOutput,
+    BatchTokenIDOutput,
     FreezeGCReq,
     MultiTokenizerRegisterReq,
 )
@@ -42,6 +41,7 @@ from sglang.srt.utils import (
     get_zmq_socket,
     kill_itself_when_parent_died,
 )
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.utils import (
     TypeBasedDispatcher,
     find_printable_text,
@@ -101,8 +101,8 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
         self._request_dispatcher = TypeBasedDispatcher(
             [
-                (BatchEmbeddingOut, self.handle_batch_embedding_out),
-                (BatchTokenIDOut, self.handle_batch_token_id_out),
+                (BatchEmbeddingOutput, self.handle_batch_embedding_out),
+                (BatchTokenIDOutput, self.handle_batch_token_id_out),
                 (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
                 (MultiTokenizerRegisterReq, lambda x: x),
                 (FreezeGCReq, self.handle_freeze_gc_req),
@@ -145,11 +145,11 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
             return output[:-1]
         return output
-    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOut):
+    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOutput):
         # If it is embedding model, no detokenization is needed.
         return recv_obj
-    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
+    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOutput):
         bs = len(recv_obj.rids)
         # Initialize decode status
@@ -224,7 +224,7 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
             s.sent_offset = len(output_str)
             output_strs.append(incremental_output)
-        return BatchStrOut(
+        return BatchStrOutput(
             rids=recv_obj.rids,
             finished_reasons=recv_obj.finished_reasons,
             output_strs=output_strs,
@@ -252,7 +252,7 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
     def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
         outputs = self.tokenizer.detokenize(recv_obj)
-        return BatchMultimodalOut(
+        return BatchMultimodalOutput(
             rids=recv_obj.rids,
             finished_reasons=recv_obj.finished_reasons,
             outputs=outputs,

sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

sglang 0.5.3rc0py3-none-any.whl → 0.5.3rc2py3-none-any.whl