PyPI - sglang - Versions diffs - 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl - Mend

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (419) hide show

sglang/bench_one_batch.py +47 -28
sglang/bench_one_batch_server.py +41 -25
sglang/bench_serving.py +378 -160
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +10 -15
sglang/profiler.py +18 -1
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +13 -64
sglang/srt/configs/load_config.py +25 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +136 -25
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +0 -10
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +20 -11
sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +4 -2
sglang/srt/disaggregation/decode.py +123 -31
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +157 -19
sglang/srt/disaggregation/nixl/conn.py +69 -24
sglang/srt/disaggregation/prefill.py +96 -270
sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +63 -19
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +83 -80
sglang/srt/entrypoints/grpc_server.py +430 -234
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +195 -102
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +225 -37
sglang/srt/entrypoints/openai/serving_base.py +49 -2
sglang/srt/entrypoints/openai/serving_chat.py +29 -74
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +15 -1
sglang/srt/entrypoints/openai/serving_responses.py +5 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +58 -6
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +33 -4
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +20 -14
sglang/srt/function_call/glm4_moe_detector.py +1 -5
sglang/srt/function_call/gpt_oss_detector.py +1 -1
sglang/srt/function_call/json_array_parser.py +0 -2
sglang/srt/function_call/minimax_m2.py +367 -0
sglang/srt/function_call/utils.py +2 -2
sglang/srt/grpc/compile_proto.py +3 -3
sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
sglang/srt/layers/activation.py +10 -1
sglang/srt/layers/attention/aiter_backend.py +3 -3
sglang/srt/layers/attention/ascend_backend.py +17 -1
sglang/srt/layers/attention/attention_registry.py +43 -23
sglang/srt/layers/attention/base_attn_backend.py +20 -1
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +24 -10
sglang/srt/layers/attention/flashinfer_backend.py +258 -22
sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
sglang/srt/layers/attention/flashmla_backend.py +2 -2
sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
sglang/srt/layers/attention/mamba/mamba.py +189 -241
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +0 -1
sglang/srt/layers/attention/nsa_backend.py +404 -90
sglang/srt/layers/attention/triton_backend.py +208 -34
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
sglang/srt/layers/attention/utils.py +89 -7
sglang/srt/layers/attention/vision.py +3 -3
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +12 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +17 -0
sglang/srt/layers/layernorm.py +64 -19
sglang/srt/layers/linear.py +9 -1
sglang/srt/layers/logits_processor.py +152 -17
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
sglang/srt/layers/moe/ep_moe/layer.py +154 -625
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
sglang/srt/layers/moe/moe_runner/runner.py +6 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +7 -6
sglang/srt/layers/moe/utils.py +20 -5
sglang/srt/layers/quantization/__init__.py +5 -58
sglang/srt/layers/quantization/awq.py +183 -9
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +27 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +152 -81
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +42 -14
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gguf.py +566 -0
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +125 -100
sglang/srt/layers/quantization/mxfp4.py +35 -68
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +23 -48
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +87 -20
sglang/srt/layers/quantization/w8a8_int8.py +30 -24
sglang/srt/layers/radix_attention.py +62 -9
sglang/srt/layers/rotary_embedding.py +686 -17
sglang/srt/layers/sampler.py +47 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +0 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/triton_backend.py +0 -1
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora_manager.py +24 -9
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +40 -16
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
sglang/srt/managers/cache_controller.py +48 -17
sglang/srt/managers/data_parallel_controller.py +146 -42
sglang/srt/managers/detokenizer_manager.py +40 -13
sglang/srt/managers/io_struct.py +69 -16
sglang/srt/managers/mm_utils.py +20 -18
sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
sglang/srt/managers/overlap_utils.py +96 -19
sglang/srt/managers/schedule_batch.py +241 -511
sglang/srt/managers/schedule_policy.py +15 -2
sglang/srt/managers/scheduler.py +420 -514
sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
sglang/srt/managers/tokenizer_manager.py +375 -95
sglang/srt/managers/tp_worker.py +212 -161
sglang/srt/managers/utils.py +78 -2
sglang/srt/mem_cache/allocator.py +7 -2
sglang/srt/mem_cache/allocator_ascend.py +2 -2
sglang/srt/mem_cache/base_prefix_cache.py +2 -2
sglang/srt/mem_cache/chunk_cache.py +13 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +16 -1
sglang/srt/mem_cache/hicache_storage.py +11 -2
sglang/srt/mem_cache/hiradix_cache.py +16 -3
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +517 -219
sglang/srt/mem_cache/memory_pool_host.py +0 -1
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +53 -19
sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
sglang/srt/mem_cache/storage/backend_factory.py +2 -2
sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +92 -26
sglang/srt/metrics/collector.py +31 -0
sglang/srt/metrics/func_timer.py +1 -1
sglang/srt/model_executor/cuda_graph_runner.py +43 -5
sglang/srt/model_executor/forward_batch_info.py +71 -25
sglang/srt/model_executor/model_runner.py +362 -270
sglang/srt/model_executor/npu_graph_runner.py +2 -3
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +424 -27
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +47 -28
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +13 -52
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +19 -3
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +418 -140
sglang/srt/models/dots_ocr.py +0 -2
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +13 -19
sglang/srt/models/gemma3_mm.py +16 -0
sglang/srt/models/gemma3n_mm.py +1 -2
sglang/srt/models/glm4_moe.py +327 -382
sglang/srt/models/glm4_moe_nextn.py +6 -16
sglang/srt/models/glm4v.py +2 -1
sglang/srt/models/glm4v_moe.py +32 -199
sglang/srt/models/gpt_oss.py +5 -5
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +3 -1
sglang/srt/models/llama.py +2 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +5 -22
sglang/srt/models/longcat_flash_nextn.py +3 -14
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/minimax_m2.py +922 -0
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +13 -3
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/nvila.py +355 -0
sglang/srt/models/nvila_lite.py +184 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +22 -1
sglang/srt/models/qwen2_5_vl.py +3 -3
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +15 -12
sglang/srt/models/qwen2_vl.py +5 -2
sglang/srt/models/qwen3.py +34 -4
sglang/srt/models/qwen3_moe.py +19 -37
sglang/srt/models/qwen3_next.py +7 -12
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +37 -33
sglang/srt/models/qwen3_vl_moe.py +57 -185
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +0 -1
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/utils.py +11 -1
sglang/srt/multimodal/processors/base_processor.py +7 -2
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +0 -1
sglang/srt/multimodal/processors/glm4v.py +2 -6
sglang/srt/multimodal/processors/internvl.py +0 -2
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +75 -16
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/reasoning_parser.py +28 -2
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +17 -22
sglang/srt/sampling/sampling_params.py +70 -2
sglang/srt/server_args.py +846 -163
sglang/srt/server_args_config_parser.py +1 -1
sglang/srt/single_batch_overlap.py +36 -31
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
sglang/srt/speculative/eagle_info.py +57 -18
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +138 -0
sglang/srt/speculative/eagle_worker.py +83 -280
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
sglang/srt/speculative/ngram_worker.py +12 -11
sglang/srt/speculative/spec_info.py +2 -0
sglang/srt/speculative/spec_utils.py +38 -3
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/two_batch_overlap.py +28 -14
sglang/srt/utils/__init__.py +1 -1
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/utils/common.py +272 -82
sglang/srt/utils/hf_transformers_utils.py +44 -17
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/utils/profile_merger.py +199 -0
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +41 -0
sglang/test/runners.py +2 -0
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +3 -0
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/test_block_fp8.py +1 -2
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +463 -107
sglang/test/test_deterministic_utils.py +74 -0
sglang/test/test_disaggregation_utils.py +81 -0
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_utils.py +85 -20
sglang/version.py +1 -1
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
sglang/srt/models/vila.py +0 -306
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0

sglang/test/test_deterministic.py CHANGED Viewed

@@ -2,7 +2,14 @@
 Batch the same prompt in random batch sizes, and test if the results are consistent across different trials.
 Usage:
-python3 -m sglang.test.test_deterministic --n-trials <numer_of_trials> --test-mode <single|mixed|prefix> --profile
+# Single mode: test determinism with varying batch sizes
+python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode single
+# Prefix mode: test with shared prefixes
+python3 -m sglang.test.test_deterministic --n-start 1 --n-trials 50 --test-mode prefix
+# Radix Cache Consistency mode: test radix cache determinism (cached vs uncached prefill)
+python3 -m sglang.test.test_deterministic --test-mode radix_cache
 """
 import argparse
@@ -10,7 +17,7 @@ import dataclasses
 import json
 import os
 import random
-from typing import List
+from typing import Any, Dict, List, Optional
 import requests
@@ -39,12 +46,15 @@ class BenchArgs:
     profile_steps: int = 3
     profile_by_stage: bool = False
     test_mode: str = "single"
+    n_trials: int = 50
+    n_start: int = 1
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument("--host", type=str, default=BenchArgs.host)
         parser.add_argument("--port", type=int, default=BenchArgs.port)
-        parser.add_argument("--n-trials", type=int, default=50)
+        parser.add_argument("--n-trials", type=int, default=BenchArgs.n_trials)
+        parser.add_argument("--n-start", type=int, default=BenchArgs.n_start)
         parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
         parser.add_argument(
             "--sampling-seed", type=int, default=BenchArgs.sampling_seed
@@ -64,7 +74,12 @@ class BenchArgs:
             "--test-mode",
             type=str,
             default=BenchArgs.test_mode,
-            choices=["single", "mixed", "prefix"],
+            choices=[
+                "single",
+                "prefix",
+                "radix_cache",
+                "p_vs_d",
+            ],
         )
         parser.add_argument("--profile", action="store_true")
         parser.add_argument(
@@ -80,26 +95,55 @@ class BenchArgs:
 def send_single(
     args,
-    batch_size: int,
     profile: bool = False,
     profile_steps: int = 3,
     profile_by_stage: bool = False,
+    return_full_response: bool = False,
+    input_ids: List[int] = None,
+    prompt: List[str] = None,
+    max_new_tokens: int = None,
+    extra_params: Optional[Dict[str, Any]] = None,
+    pick_first_result: bool = True,
 ):
     base_url = f"http://{args.host}:{args.port}"
-    prompt = [PROMPT_1] * batch_size
-    json_data = {
-        "text": prompt,
-        "sampling_params": {
-            "temperature": args.temperature,
-            "max_new_tokens": args.max_new_tokens,
-            "frequency_penalty": args.frequency_penalty,
-            "presence_penalty": args.presence_penalty,
-        },
-        "return_logprob": args.return_logprob,
-        "stream": args.stream,
-    }
+    # Use input_ids if provided, otherwise use text prompts
+    if input_ids is not None:
+        assert prompt is None
+        json_data = {
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": args.temperature,
+                "max_new_tokens": (
+                    max_new_tokens
+                    if max_new_tokens is not None
+                    else args.max_new_tokens
+                ),
+                "frequency_penalty": args.frequency_penalty,
+                "presence_penalty": args.presence_penalty,
+            },
+            "return_logprob": args.return_logprob,
+            "stream": args.stream,
+            **(extra_params or {}),
+        }
+    else:
+        assert input_ids is None
+        json_data = {
+            "text": prompt,
+            "sampling_params": {
+                "temperature": args.temperature,
+                "max_new_tokens": (
+                    max_new_tokens
+                    if max_new_tokens is not None
+                    else args.max_new_tokens
+                ),
+                "frequency_penalty": args.frequency_penalty,
+                "presence_penalty": args.presence_penalty,
+            },
+            "return_logprob": args.return_logprob,
+            "stream": args.stream,
+            **(extra_params or {}),
+        }
     if args.sampling_seed is not None:
         # sglang server cannot parse None value for sampling_seed
@@ -116,6 +160,11 @@ def send_single(
         stream=args.stream,
     )
+    if response.status_code != 200:
+        ret = response.json()
+        print(f"Error: {ret}")
+        return None
     if args.stream:
         for chunk in response.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
@@ -125,24 +174,30 @@ def send_single(
                 ret = json.loads(chunk[5:].strip("\n"))
     else:
         ret = response.json()
-    ret = ret[0]
-    if response.status_code != 200:
-        print(ret)
-        return -1
+    if pick_first_result:
+        ret = ret[0] if isinstance(ret, list) else ret
-    return ret["text"]
+    if return_full_response:
+        return ret
+    else:
+        return ret["text"]
-def send_mixed(args, batch_size: int):
-    num_long_prompt = 0 if batch_size <= 10 else random.randint(1, 10)
-    num_prompt_1 = random.randint(1, batch_size - num_long_prompt)
-    num_prompt_2 = batch_size - num_prompt_1 - num_long_prompt
+def send_prefix(
+    args, batch_size: int, prompts: List[str], return_full_response: bool = False
+):
+    requests.post(f"http://{args.host}:{args.port}/flush_cache")
+    batch_data = []
+    sampled_indices = []
+    for _ in range(batch_size):
+        sampled_index = random.randint(0, len(prompts) - 1)
+        sampled_indices.append(sampled_index)
+        batch_data.append(prompts[sampled_index])
     json_data = {
-        "text": [PROMPT_1] * num_prompt_1
-        + [PROMPT_2] * num_prompt_2
-        + [LONG_PROMPT] * num_long_prompt,
+        "text": batch_data,
         "sampling_params": {
             "temperature": args.temperature,
             "max_new_tokens": args.max_new_tokens,
@@ -166,103 +221,171 @@ def send_mixed(args, batch_size: int):
         print(ret)
         return -1, -1, -1
-    prompt_1_ret = [ret[i]["text"] for i in range(num_prompt_1)]
-    prompt_2_ret = [
-        ret[i]["text"] for i in range(num_prompt_1, num_prompt_1 + num_prompt_2)
-    ]
-    long_prompt_ret = [
-        ret[i]["text"]
-        for i in range(
-            num_prompt_1 + num_prompt_2, num_prompt_1 + num_prompt_2 + num_long_prompt
-        )
-    ]
+    if return_full_response:
+        # Return full responses grouped by prompt index
+        ret_dict = {i: [] for i in range(len(prompts))}
+        for i in range(batch_size):
+            ret_dict[sampled_indices[i]].append(ret[i])
+        return ret_dict
+    else:
+        # Return only text grouped by prompt index
+        ret_dict = {i: [] for i in range(len(prompts))}
+        for i in range(batch_size):
+            ret_dict[sampled_indices[i]].append(ret[i]["text"])
+        return ret_dict
+def compare_logprobs(logprobs1, logprobs2, tolerance=0):
+    """Compare two logprobs sequences with a tolerance."""
+    if len(logprobs1) != len(logprobs2):
+        return False, f"Length mismatch: {len(logprobs1)} vs {len(logprobs2)}"
+    for i, (lp1, lp2) in enumerate(zip(logprobs1, logprobs2)):
+        # Each element is [logprob, token_id]
+        if lp1[1] != lp2[1]:
+            return False, f"Token ID mismatch at position {i}: {lp1[1]} vs {lp2[1]}"
+        if abs(lp1[0] - lp2[0]) > tolerance:
+            return (
+                False,
+                f"Logprob mismatch at position {i}: {lp1[0]} vs {lp2[0]} (diff: {abs(lp1[0] - lp2[0])})",
+            )
+    return True, "Logprobs match"
-    return prompt_1_ret, prompt_2_ret, long_prompt_ret
+def _test_mode_p_vs_d(args, batch_size):
+    print()
+    print(f"Execute: test p_vs_d {batch_size=}")
-def send_prefix(args, batch_size: int, prompts: List[str]):
+    random.seed(42)
+    args.return_logprob = True
+    query_extra_params = {
+        "logprob_start_len": 0,
+        "return_text_in_logprobs": True,
+    }
+    def _create_prompts():
+        ans = [PROMPT_1, PROMPT_2]
+        for i in range(batch_size - len(ans)):
+            end = random.randrange(1, 4096)
+            if random.random() < 0.5:
+                begin = 0
+            else:
+                begin = random.randrange(0, end)
+            ans.append(LONG_PROMPT[begin:end])
+        return ans[:batch_size]
+    # warmup + flush
+    send_single(args, input_ids=[1] * 64, max_new_tokens=65, return_full_response=True)
     requests.post(f"http://{args.host}:{args.port}/flush_cache")
-    batch_data = []
-    sampled_indices = []
-    for _ in range(batch_size):
-        sampled_index = random.randint(0, len(prompts) - 1)
-        sampled_indices.append(sampled_index)
-        batch_data.append(prompts[sampled_index])
+    prompts = _create_prompts()
-    json_data = {
-        "text": batch_data,
-        "sampling_params": {
-            "temperature": args.temperature,
-            "max_new_tokens": args.max_new_tokens,
-            "frequency_penalty": args.frequency_penalty,
-            "presence_penalty": args.presence_penalty,
-        },
-        "return_logprob": args.return_logprob,
-        "stream": args.stream,
-    }
+    resp_a = send_single(
+        args,
+        prompt=prompts,
+        max_new_tokens=args.max_new_tokens,
+        return_full_response=True,
+        pick_first_result=False,
+        extra_params=query_extra_params,
+    )
+    info_a = _extract_ids_and_logprobs(resp_a)
-    if args.sampling_seed is not None:
-        json_data["sampling_params"]["sampling_seed"] = args.sampling_seed
+    requests.post(f"http://{args.host}:{args.port}/flush_cache")
-    response = requests.post(
-        f"http://{args.host}:{args.port}/generate",
-        json=json_data,
-        stream=args.stream,
+    resp_b = send_single(
+        args,
+        input_ids=[x["io"].token_ids for x in info_a],
+        max_new_tokens=1,
+        return_full_response=True,
+        pick_first_result=False,
+        extra_params=query_extra_params,
     )
-    ret = response.json()
-    if response.status_code != 200:
-        print(ret)
-        return -1, -1, -1
+    info_b = _extract_ids_and_logprobs(resp_b)
-    ret_dict = {i: [] for i in range(len(prompts))}
-    for i in range(batch_size):
-        ret_dict[sampled_indices[i]].append(ret[i]["text"])
+    ans = []
+    for i, (info_a_item, info_b_item) in enumerate(zip(info_a, info_b, strict=True)):
+        print(f"Compare sequence {i} in batch...")
+        correct = TokenIdsAndLogprobs.compare(info_a_item["io"], info_b_item["input"])
+        ans.append(int(correct))
-    return ret_dict
+    return ans
-def test_deterministic(args):
-    # First do some warmups
-    for i in range(3):
-        send_single(args, 16, args.profile)
+@dataclasses.dataclass
+class TokenIdsAndLogprobs:
+    """Token ids paired with their per-token logprobs for one sequence.
+
+    Instances can be concatenated with ``+`` and checked for exact equality
+    with :meth:`compare`.
+    """
+
+    token_ids: List[int]
+    # Entries may be ``None``; ``compare`` renders such entries as "None".
+    logprobs: List[float]
+
+    def __add__(self, other):
+        """Return a new instance holding ``self``'s entries followed by ``other``'s."""
+        return TokenIdsAndLogprobs(
+            token_ids=self.token_ids + other.token_ids,
+            logprobs=self.logprobs + other.logprobs,
+        )
+
+    @classmethod
+    def compare(cls, a: "TokenIdsAndLogprobs", b: "TokenIdsAndLogprobs"):
+        """Print a comparison report and return whether ``a`` and ``b`` are identical.
+
+        Token ids and logprobs are compared with exact equality (no tolerance).
+        On a logprob mismatch both lists and their element-wise absolute
+        difference are printed.
+
+        Returns:
+            ``True`` only if both the token ids and the logprobs are equal.
+
+        Raises:
+            AssertionError: if the two token id lists differ in length.
+        """
+        assert len(a.token_ids) == len(b.token_ids)
+        token_match = a.token_ids == b.token_ids
+        logprobs_match = a.logprobs == b.logprobs
+
+        if token_match:
+            print(f"Token match: {a.token_ids}")
+        else:
+            print(f"❗Token mismatch: {a.token_ids=} {b.token_ids=}")
+
+        if logprobs_match:
+            print("Logprobs match:", a.logprobs)
+        else:
+            print("❗Logprobs mismatch")
+            print(
+                "    A:   ",
+                [f"{x:.10f}" if x is not None else "None" for x in a.logprobs],
+            )
+            print(
+                "    B:   ",
+                [f"{x:.10f}" if x is not None else "None" for x in b.logprobs],
+            )
+            # A difference is only defined when both sides are numbers. Guard
+            # on both values so a None on either side is reported as NaN
+            # instead of raising TypeError in the middle of the report.
+            diff = [
+                abs(x - y) if x is not None and y is not None else float("nan")
+                for x, y in zip(a.logprobs, b.logprobs)
+            ]
+            print("    Diff:", [f"{x:.10e}" for x in diff])
+
+        return token_match and logprobs_match
+def _extract_ids_and_logprobs(responses):
+    """Convert one response, or a list of responses, into token-id/logprob records.
+
+    Each response must provide ``meta_info["input_token_logprobs"]`` and
+    ``meta_info["output_token_logprobs"]``, whose items unpack as
+    ``(logprob, token_id, text)``.
+
+    Returns:
+        A list with one dict per response. Each dict holds ``input``,
+        ``output`` and ``io`` (input followed by output) as
+        ``TokenIdsAndLogprobs`` values.
+    """
+
+    def _collect(resp, field):
+        # Every entry must unpack into exactly three values; the text is unused.
+        pairs = [
+            (token_id, logprob)
+            for logprob, token_id, _text in resp["meta_info"][field]
+        ]
+        return TokenIdsAndLogprobs(
+            token_ids=[token_id for token_id, _ in pairs],
+            logprobs=[logprob for _, logprob in pairs],
+        )
+
+    def _split_response(resp):
+        prompt_part = _collect(resp, "input_token_logprobs")
+        generated_part = _collect(resp, "output_token_logprobs")
+        return {
+            "input": prompt_part,
+            "output": generated_part,
+            "io": prompt_part + generated_part,
+        }
+
+    # A single response is treated as a batch of one.
+    batch = responses if isinstance(responses, list) else [responses]
+    return [_split_response(resp) for resp in batch]
+def test_deterministic(args):
     if args.test_mode == "single":
         # In single mode, we test the deterministic behavior by sending the same prompt in batch sizes ranging from 1 to n_trials.
         texts = []
         for i in range(1, args.n_trials + 1):
             batch_size = i
-            text = send_single(args, batch_size, args.profile)
+            text = send_single(args, args.profile, prompt=[PROMPT_1] * batch_size)
             text = text.replace("\n", " ")
             print(f"Trial {i} with batch size {batch_size}: {text}")
             texts.append(text)
         print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
-    elif args.test_mode == "mixed":
-        # In mixed mode, we send a mixture of two short prompts and one long prompt in the same batch with batch size ranging from 1 to n_trials.
-        output_prompt_1 = []
-        output_prompt_2 = []
-        output_long_prompt = []
-        for i in range(1, args.n_trials + 1):
-            batch_size = i
-            ret_prompt_1, ret_prompt_2, ret_long_prompt = send_mixed(args, batch_size)
-            output_prompt_1.extend(ret_prompt_1)
-            output_prompt_2.extend(ret_prompt_2)
-            output_long_prompt.extend(ret_long_prompt)
-            print(
-                f"Testing Trial {i} with batch size {batch_size}, number of prompt 1: {len(ret_prompt_1)}, number of prompt 2: {len(ret_prompt_2)}, number of long prompt: {len(ret_long_prompt)}"
-            )
-        print(
-            f"Prompt 1: total samples: {len(output_prompt_1)}, Unique samples: {len(set(output_prompt_1))}"
-        )
-        print(
-            f"Prompt 2: total samples: {len(output_prompt_2)}, Unique samples: {len(set(output_prompt_2))}"
-        )
-        print(
-            f"Long prompt: total samples: {len(output_long_prompt)}, Unique samples: {len(set(output_long_prompt))}"
-        )
+        return [len(set(texts))]
     elif args.test_mode == "prefix":
         # In prefix mode, we create prompts from the same long prompt, with different lengths of common prefix.
@@ -270,21 +393,251 @@ def test_deterministic(args):
         num_prompts = len(len_prefix)
         outputs = {i: [] for i in range(4)}
         prompts = [LONG_PROMPT[: len_prefix[i]] for i in range(4)]
-        for i in range(1, args.n_trials + 1):
+        # If return_logprob is enabled, store full responses for comparison
+        if args.return_logprob:
+            full_responses = {i: [] for i in range(4)}
+        for i in range(args.n_start, args.n_start + args.n_trials):
             batch_size = i
-            ret_dict = send_prefix(args, batch_size, prompts)
+            ret_dict = send_prefix(
+                args, batch_size, prompts, return_full_response=args.return_logprob
+            )
             msg = f"Testing Trial {i} with batch size {batch_size},"
             for i in range(num_prompts):
                 msg += f" # prefix length {len_prefix[i]}: {len(ret_dict[i])},"
             print(msg)
             for i in range(num_prompts):
-                outputs[i].extend(ret_dict[i])
+                if args.return_logprob:
+                    # Store full response for logprob comparison
+                    full_responses[i].extend(ret_dict[i])
+                    # Extract text for determinism check
+                    outputs[i].extend([resp["text"] for resp in ret_dict[i]])
+                else:
+                    outputs[i].extend(ret_dict[i])
         for i in range(num_prompts):
             print(
                 f"Prompt {i} with prefix length {len_prefix[i]}: total samples: {len(outputs[i])}, Unique samples: {len(set(outputs[i]))}"
             )
+        results = []
+        for i in range(num_prompts):
+            results.append(len(set(outputs[i])))
+        # If logprobs are enabled, compare them across different batch sizes
+        if args.return_logprob:
+            print(f"\n{'='*60}")
+            print("Logprobs Comparison Across Batch Sizes")
+            print("=" * 60)
+            logprob_results = []
+            for prompt_idx in range(num_prompts):
+                print(
+                    f"\nPrompt {prompt_idx} (prefix length {len_prefix[prompt_idx]}):"
+                )
+                responses = full_responses[prompt_idx]
+                if len(responses) < 2:
+                    continue
+                # Compare all responses against the first one
+                reference = responses[0]
+                all_match = True
+                mismatches = []
+                for j, resp in enumerate(responses[1:], start=1):
+                    ref_logprobs = reference["meta_info"]["output_token_logprobs"]
+                    resp_logprobs = resp["meta_info"]["output_token_logprobs"]
+                    match, msg = compare_logprobs(ref_logprobs, resp_logprobs)
+                    if not match:
+                        print(f"  ✗ Sample {j+1}: {msg}")
+                        mismatches.append((j + 1, msg))
+                        all_match = False
+                if all_match:
+                    print(f"  ✓ All {len(responses)} samples have identical logprobs")
+                    logprob_results.append(1)
+                else:
+                    print(
+                        f"  ✗ Found {len(mismatches)} mismatches out of {len(responses)} samples"
+                    )
+                    logprob_results.append(0)
+            print(f"\n{'='*60}")
+            if all(r == 1 for r in logprob_results):
+                print("✓✓✓ Logprobs are identical across all batch sizes! ✓✓✓")
+            else:
+                print("✗✗✗ Some logprobs differ across batch sizes! ✗✗✗")
+        return results
+    elif args.test_mode == "radix_cache":
+        # Radix mode requires logprobs to compare results
+        args.return_logprob = True
+        print("\n=== Prefill Cache Consistency Test ===")
+        print(
+            "This test verifies prefill request produces consistent logprobs w/ and w/o cache.\n"
+        )
+        # We noticed that we cannot call flush cache before any request, otherwise it will hang.
+        warmup_response = send_single(
+            args, input_ids=[1] * 64, max_new_tokens=65, return_full_response=True
+        )
+        # Flush cache first to make sure there is no cache hit from previous tests
+        flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
+        print(f"Step 1: Generating random 64 token IDs...")
+        # Use a reasonable token ID range (e.g., 1-50000 for most tokenizers)
+        # Avoid special tokens like 0 (padding), 1 (BOS), 2 (EOS)
+        # set seed for random.randint
+        random.seed(42)
+        initial_token_ids = [random.randint(100, 50000) for _ in range(64)]
+        print(f"✓ Using {len(initial_token_ids)} initial tokens")
+        print(f"  Initial token IDs: {initial_token_ids}")
+        print(
+            f"\nStep 2: Generating 2 tokens from {len(initial_token_ids)} token prefix..."
+        )
+        first_response = send_single(
+            args,
+            input_ids=initial_token_ids,
+            max_new_tokens=100,
+            return_full_response=True,
+        )
+        first_output_text = first_response["text"]
+        first_output_token_ids = first_response["output_ids"]
+        first_output_logprobs = first_response["meta_info"]["output_token_logprobs"]
+        expected_token_id = first_output_token_ids[-1]
+        expected_logprob = first_output_logprobs[-1][0]
+        print(f"✓ Generated {len(first_output_token_ids)} tokens")
+        print(f'  Output text: "{first_output_text}"')
+        print(
+            f"\nStep 3: Generating with radix cache (164 tokens prefill, should hit > 128 tokens cache, based on page size)..."
+        )
+        prefix_token_ids = initial_token_ids + first_output_token_ids[:-1]
+        print(
+            f"  Prefix: {len(initial_token_ids)} initial + 64 generated = {len(prefix_token_ids)} tokens"
+        )
+        print(f"Using Prompt: {prefix_token_ids}")
+        cached_response = send_single(
+            args,
+            input_ids=prefix_token_ids,
+            max_new_tokens=1,
+            return_full_response=True,
+        )
+        cached_logprobs = cached_response["meta_info"]["output_token_logprobs"]
+        cached_token_data = cached_logprobs[0]
+        cached_logprob = cached_token_data[0]
+        cached_token_id = cached_token_data[1]
+        print(f"✓ Generated with cache:")
+        print(f"  Token ID: {cached_token_id}")
+        print(f"  Logprob:  {cached_logprob:.10f}")
+        print(f"\nStep 4: Flushing cache...")
+        flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
+        print(
+            f"\nStep 5: Generating without cache (same 164 tokens prefill, no cache)..."
+        )
+        print(f"Using Prompt: {prefix_token_ids}")
+        uncached_response = send_single(
+            args,
+            input_ids=prefix_token_ids,
+            max_new_tokens=1,
+            return_full_response=True,
+        )
+        uncached_logprobs = uncached_response["meta_info"]["output_token_logprobs"]
+        uncached_token_data = uncached_logprobs[0]
+        uncached_logprob = uncached_token_data[0]
+        uncached_token_id = uncached_token_data[1]
+        print(f"✓ Generated without cache:")
+        print(f"  Token ID: {uncached_token_id}")
+        print(f"  Logprob:  {uncached_logprob:.10f}")
+        # Step 6: Compare results
+        print(f"\n{'='*60}")
+        print("Comparison 1: Decode (Request 1) vs Prefill with Cache (Request 2)")
+        print("=" * 60)
+        # Compare first request (decode) vs second request (prefill with cache)
+        # We expect them to be different (different kernels)
+        decode_vs_prefill_token_match = expected_token_id == cached_token_id
+        decode_vs_prefill_logprob_match = expected_logprob == cached_logprob
+        print(
+            f"  Decode token (Request 1):          ID={expected_token_id}, logprob={expected_logprob:.10f}"
+        )
+        print(
+            f"  Prefill w/ cache token (Request 2): ID={cached_token_id}, logprob={cached_logprob:.10f}"
+        )
+        print(
+            f"  Token ID match: {'✓ YES' if decode_vs_prefill_token_match else '✗ NO'}"
+        )
+        print(
+            f"  Logprob match:  {'✓ YES' if decode_vs_prefill_logprob_match else '✗ NO'}"
+        )
+        if not decode_vs_prefill_logprob_match:
+            diff = abs(expected_logprob - cached_logprob)
+            print(f"  Logprob difference: {diff:.10e}")
+        print(f"  Note: We expect these to be DIFFERENT (decode vs prefill kernels)")
+        print(f"\n{'='*60}")
+        print(
+            "Comparison 2: Cached Prefill (Request 2) vs Uncached Prefill (Request 3)"
+        )
+        print("=" * 60)
+        # Main test: compare cached vs uncached prefill (should be identical)
+        token_match = cached_token_id == uncached_token_id
+        logprob_match = cached_logprob == uncached_logprob
+        print(
+            f"  Cached prefill token (Request 2):   ID={cached_token_id}, logprob={cached_logprob:.10f}"
+        )
+        print(
+            f"  Uncached prefill token (Request 3): ID={uncached_token_id}, logprob={uncached_logprob:.10f}"
+        )
+        print(f"  Token ID match: {'✓ YES' if token_match else '✗ NO'}")
+        if not token_match:
+            print(f"    Cached:   {cached_token_id}")
+            print(f"    Uncached: {uncached_token_id}")
+        print(f"  Logprob match:  {'✓ YES' if logprob_match else '✗ NO'}")
+        if not logprob_match:
+            print(f"    Cached:   {cached_logprob:.10f}")
+            print(f"    Uncached: {uncached_logprob:.10f}")
+            diff = abs(cached_logprob - uncached_logprob)
+            print(f"    Difference: {diff:.10e}")
+        print(f"  Note: We expect these to be IDENTICAL (both prefill kernels)")
+        print(f"\n{'='*60}")
+        if token_match and logprob_match:
+            print("✓✓✓ TEST PASSED - Radix cache is consistent! ✓✓✓")
+            return [1]
+        else:
+            print("✗✗✗ TEST FAILED - Radix cache produces different results! ✗✗✗")
+            return [0]
+    elif args.test_mode == "p_vs_d":
+        # TODO also extract other modes to functions
+        ans = []
+        for i in range(1, args.n_trials + 1):
+            ans += _test_mode_p_vs_d(args, batch_size=i)
+        return ans
     else:
         raise ValueError(f"Invalid test mode: {args.test_mode}")
@@ -294,4 +647,7 @@ if __name__ == "__main__":
     BenchArgs.add_cli_args(parser)
     args = parser.parse_args()
+    if args.sampling_seed is None:
+        args.sampling_seed = 42
     test_deterministic(args)

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

sglang 0.5.3rc2py3-none-any.whl → 0.5.4.post1py3-none-any.whl