sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/mamba/mamba_utils.py DELETED
@@ -1,81 +0,0 @@
- # Adapted from: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/mamba_utils.py
- from sglang.srt.distributed.utils import divide
-
-
- class MambaStateShapeCalculator:
-
-     @classmethod
-     def linear_attention_state_shape(
-         cls,
-         num_heads: int,
-         tp_size: int,
-         head_dim: int,
-     ) -> tuple[tuple[int, int, int], ...]:
-
-         state_shape = (num_heads // tp_size, head_dim, head_dim)
-         return (state_shape,)
-
-     @classmethod
-     def mamba1_state_shape(
-         cls,
-         tp_world_size: int,
-         intermediate_size: int,
-         state_size: int,
-         conv_kernel: int,
-     ) -> tuple[tuple[int, int], tuple[int, int]]:
-         conv_state_shape = (divide(intermediate_size, tp_world_size), conv_kernel - 1)
-
-         temporal_state_shape = (divide(intermediate_size, tp_world_size), state_size)
-
-         conv_state_shape = conv_state_shape[1], conv_state_shape[0]
-
-         return conv_state_shape, temporal_state_shape
-
-     @classmethod
-     def mamba2_state_shape(
-         cls,
-         tp_world_size: int,
-         intermediate_size: int,
-         n_groups: int,
-         num_heads: int,
-         head_dim: int,
-         state_size: int,
-         conv_kernel: int,
-     ) -> tuple[tuple[int, int], tuple[int, int, int]]:
-         # if n_groups is not divisible by world_size, need to extend the shards
-         # to ensure all groups needed by a head is sharded along with it
-         n_groups = n_groups + cls.extra_groups_for_head_shards(n_groups, tp_world_size)
-         # heads and n_groups are TP-ed
-         conv_dim = intermediate_size + 2 * n_groups * state_size
-
-         # contiguous along 'dim' axis
-         conv_state_shape = (conv_kernel - 1, divide(conv_dim, tp_world_size))
-
-         # These are not TP-ed as they depend on A, dt_bias, D
-         # - they are typically small
-         # e.g., (h_heads, head_dim, state_size) = (128, 64, 128)
-         temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
-         return conv_state_shape, temporal_state_shape
-
-     @classmethod
-     def short_conv_state_shape(
-         cls,
-         tp_world_size: int,
-         intermediate_size: int,
-         conv_kernel: int,
-     ) -> tuple[tuple[int, int]]:
-         conv_dim = divide(intermediate_size, tp_world_size)
-         conv_state_shape = (conv_kernel - 1, conv_dim)
-         return (conv_state_shape,)
-
-     @classmethod
-     def extra_groups_for_head_shards(cls, ngroups: int, tp_size: int):
-         """Compute the increase in group numbers to account for
-         replication in order to accompany the head shards."""
-
-         # in the case ngoups % tp_size == 0, this will be zero
-         if ngroups % tp_size == 0:
-             return 0
-
-         # for n_groups == 1, this is exactly tp_size - n_groups
-         return tp_size - ngroups
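Since the deleted `MambaStateShapeCalculator` is pure shape arithmetic, its behavior is easy to check in isolation. Below is a minimal standalone sketch of the `mamba2_state_shape` path, with a local `divide` assumed to behave like `sglang.srt.distributed.utils.divide` (exact integer division); the config numbers are hypothetical, chosen to match the `(128, 64, 128)` example in the code comment, sharded over 4 TP ranks.

```python
# Standalone sketch of the removed mamba2_state_shape logic.
# `divide` is an assumption: exact integer division, mirroring
# sglang.srt.distributed.utils.divide.
def divide(numerator: int, denominator: int) -> int:
    assert numerator % denominator == 0, "not evenly divisible"
    return numerator // denominator


def extra_groups_for_head_shards(ngroups: int, tp_size: int) -> int:
    # Zero when the groups already shard evenly across TP ranks.
    if ngroups % tp_size == 0:
        return 0
    # e.g. for ngroups == 1, replicate up to one group per rank.
    return tp_size - ngroups


def mamba2_state_shape(tp, intermediate, n_groups, heads, head_dim, state, conv_k):
    n_groups += extra_groups_for_head_shards(n_groups, tp)
    conv_dim = intermediate + 2 * n_groups * state   # heads and groups are TP-ed
    conv_state = (conv_k - 1, divide(conv_dim, tp))  # contiguous along 'dim'
    temporal_state = (divide(heads, tp), head_dim, state)  # not TP-ed per head
    return conv_state, temporal_state


# Hypothetical Mamba2 config on 4 TP ranks.
print(mamba2_state_shape(4, 8192, 8, 128, 64, 128, 4))
# -> ((3, 2560), (32, 64, 128))
```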
sglang/srt/managers/tp_worker_overlap_thread.py DELETED
@@ -1,311 +0,0 @@
- # Copyright 2023-2024 SGLang Team
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """A tensor parallel worker."""
- from __future__ import annotations
-
- import dataclasses
- import logging
- import signal
- import threading
- from queue import Queue
- from typing import TYPE_CHECKING, List, Optional, Tuple
-
- import psutil
- import torch
-
- from sglang.srt.managers.io_struct import (
-     DestroyWeightsUpdateGroupReqInput,
-     GetWeightsByNameReqInput,
-     InitWeightsSendGroupForRemoteInstanceReqInput,
-     InitWeightsUpdateGroupReqInput,
-     LoadLoRAAdapterReqInput,
-     SendWeightsToRemoteInstanceReqInput,
-     UnloadLoRAAdapterReqInput,
-     UpdateWeightFromDiskReqInput,
-     UpdateWeightsFromDistributedReqInput,
-     UpdateWeightsFromTensorReqInput,
- )
- from sglang.srt.managers.overlap_utils import FutureMap
- from sglang.srt.managers.schedule_batch import ModelWorkerBatch
- from sglang.srt.managers.tp_worker import TpModelWorker
- from sglang.srt.model_executor.forward_batch_info import ForwardBatchOutput
- from sglang.srt.server_args import ServerArgs
- from sglang.srt.utils import DynamicGradMode
- from sglang.utils import get_exception_traceback
-
- if TYPE_CHECKING:
-     from sglang.srt.managers.cache_controller import LayerDoneCounter
-
- logger = logging.getLogger(__name__)
-
-
- class TpModelWorkerClient:
-     """A tensor parallel model worker."""
-
-     def __init__(
-         self,
-         server_args: ServerArgs,
-         gpu_id: int,
-         tp_rank: int,
-         moe_ep_rank: int,
-         pp_rank: int,
-         dp_rank: Optional[int],
-         nccl_port: int,
-     ):
-         # Load the model
-         self.worker = TpModelWorker(
-             server_args, gpu_id, tp_rank, moe_ep_rank, pp_rank, dp_rank, nccl_port
-         )
-         self.max_running_requests = self.worker.max_running_requests
-         self.device = self.worker.device
-         self.gpu_id = gpu_id
-
-         # Init future mappings
-         self.future_map = FutureMap(self.max_running_requests, self.device)
-
-         # Launch threads
-         self.input_queue = Queue[Tuple[ModelWorkerBatch, int, torch.Event]]()
-         self.output_queue = Queue()
-         self.forward_stream = torch.get_device_module(self.device).Stream()
-         self.forward_thread = threading.Thread(
-             target=self.forward_thread_func,
-         )
-         self.forward_thread.start()
-         self.parent_process = psutil.Process().parent()
-         self.scheduler_stream = torch.get_device_module(self.device).current_stream()
-         if self.device == "cpu":
-             self.scheduler_stream.synchronize = lambda: None  # No-op for CPU
-
-         self.hicache_layer_transfer_counter = None
-
-     def register_hicache_layer_transfer_counter(self, counter: LayerDoneCounter):
-         self.hicache_layer_transfer_counter = counter
-
-     def get_worker_info(self):
-         return self.worker.get_worker_info()
-
-     def get_tokens_per_layer_info(self):
-         return self.worker.get_tokens_per_layer_info()
-
-     @property
-     def sliding_window_size(self) -> Optional[int]:
-         return self.worker.sliding_window_size
-
-     @property
-     def is_hybrid(self) -> bool:
-         return self.worker.is_hybrid
-
-     def get_pad_input_ids_func(self):
-         return self.worker.get_pad_input_ids_func()
-
-     def get_tp_group(self):
-         return self.worker.get_tp_group()
-
-     def get_attention_tp_group(self):
-         return self.worker.get_attention_tp_group()
-
-     def get_attention_tp_cpu_group(self):
-         return self.worker.get_attention_tp_cpu_group()
-
-     def get_memory_pool(self):
-         return (
-             self.worker.model_runner.req_to_token_pool,
-             self.worker.model_runner.token_to_kv_pool_allocator,
-         )
-
-     def get_kv_cache(self):
-         return self.worker.model_runner.token_to_kv_pool
-
-     def forward_thread_func(self):
-         try:
-             with torch.get_device_module(self.device).stream(self.forward_stream):
-                 self.forward_thread_func_()
-         except Exception:
-             traceback = get_exception_traceback()
-             logger.error(f"TpModelWorkerClient hit an exception: {traceback}")
-             self.parent_process.send_signal(signal.SIGQUIT)
-
-     @DynamicGradMode()
-     def forward_thread_func_(self):
-         batch_pt = 0
-         batch_lists: List = [None] * 2
-
-         while True:
-             model_worker_batch, future_map_ct, sync_event = self.input_queue.get()
-             if not model_worker_batch:
-                 break
-
-             sync_event.wait()
-
-             # Keep a reference of model_worker_batch by storing it into a list.
-             # Otherwise, the tensor members of model_worker_batch will be released
-             # by pytorch and cause CUDA illegal memory access errors.
-             batch_lists[batch_pt % 2] = model_worker_batch
-             batch_pt += 1
-
-             # Create event
-             copy_done = torch.get_device_module(self.device).Event()
-
-             # Resolve future tokens in the input
-             self.future_map.resolve_future(model_worker_batch)
-
-             # Run forward
-             forward_batch_output = self.worker.forward_batch_generation(
-                 model_worker_batch,
-                 model_worker_batch.launch_done,
-             )
-
-             logits_output, next_token_ids, can_run_cuda_graph = (
-                 forward_batch_output.logits_output,
-                 forward_batch_output.next_token_ids,
-                 forward_batch_output.can_run_cuda_graph,
-             )
-
-             # Update the future token ids map
-             bs = len(model_worker_batch.seq_lens)
-             if model_worker_batch.is_prefill_only:
-                 # For prefill-only requests, create dummy token IDs on CPU
-                 next_token_ids = torch.zeros(bs, dtype=torch.long)
-
-             # store the future indices into future map
-             self.future_map.store_to_map(future_map_ct, bs, next_token_ids)
-
-             # Copy results to the CPU
-             if model_worker_batch.return_logprob:
-                 if logits_output.next_token_logprobs is not None:
-                     logits_output.next_token_logprobs = (
-                         logits_output.next_token_logprobs.to("cpu", non_blocking=True)
-                     )
-                 if logits_output.input_token_logprobs is not None:
-                     logits_output.input_token_logprobs = (
-                         logits_output.input_token_logprobs.to("cpu", non_blocking=True)
-                     )
-             if logits_output.hidden_states is not None:
-                 logits_output.hidden_states = logits_output.hidden_states.to(
-                     "cpu", non_blocking=True
-                 )
-             # Only copy to CPU if not already on CPU
-             if next_token_ids.device.type != "cpu":
-                 next_token_ids = next_token_ids.to("cpu", non_blocking=True)
-             copy_done.record()
-
-             self.output_queue.put(
-                 (copy_done, logits_output, next_token_ids, can_run_cuda_graph)
-             )
-
-     def resolve_last_batch_result(self, launch_done: Optional[threading.Event] = None):
-         """
-         This function is called to resolve the last batch result and
-         wait for the current batch to be launched. Used in overlap mode.
-         """
-         copy_done, logits_output, next_token_ids, can_run_cuda_graph = (
-             self.output_queue.get()
-         )
-
-         if launch_done is not None:
-             launch_done.wait()
-         copy_done.synchronize()
-
-         if logits_output.next_token_logprobs is not None:
-             logits_output.next_token_logprobs = (
-                 logits_output.next_token_logprobs.tolist()
-             )
-         if logits_output.input_token_logprobs is not None:
-             logits_output.input_token_logprobs = tuple(
-                 logits_output.input_token_logprobs.tolist()
-             )
-         next_token_ids = next_token_ids.tolist()
-         return logits_output, next_token_ids, can_run_cuda_graph
-
-     def forward_batch_generation(
-         self, model_worker_batch: ModelWorkerBatch
-     ) -> ForwardBatchOutput:
-         # Create a new copy of sampling_info because it will be updated in-place by the scheduler for the next batch.
-         sampling_info = model_worker_batch.sampling_info
-         sampling_info.update_penalties()
-         model_worker_batch.sampling_info = self.cur_sampling_info = dataclasses.replace(
-             sampling_info,
-             sampling_info_done=threading.Event(),
-             penalizer_orchestrator=None,
-         )
-
-         # A cuda stream sync here to avoid the cuda illegal memory access error.
-         sync_event = torch.get_device_module(self.device).Event()
-         sync_event.record(self.scheduler_stream)
-
-         # Push a new batch to the queue
-         bs = len(model_worker_batch.seq_lens)
-         cur_future_map_ct = self.future_map.update_ct(bs)
-         self.input_queue.put((model_worker_batch, cur_future_map_ct, sync_event))
-
-         # get this forward batch's future token ids
-         future_next_token_ids = self.future_map.update_next_future(
-             cur_future_map_ct, bs
-         )
-         return ForwardBatchOutput(
-             next_token_ids=future_next_token_ids,
-             can_run_cuda_graph=False,
-         )
-
-     def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
-         success, message = self.worker.update_weights_from_disk(recv_req)
-         return success, message
-
-     def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
-         success, message = self.worker.init_weights_update_group(recv_req)
-         return success, message
-
-     def destroy_weights_update_group(self, recv_req: DestroyWeightsUpdateGroupReqInput):
-         success, message = self.worker.destroy_weights_update_group(recv_req)
-         return success, message
-
-     def init_weights_send_group_for_remote_instance(
-         self, recv_req: InitWeightsSendGroupForRemoteInstanceReqInput
-     ):
-         success, message = self.worker.init_weights_send_group_for_remote_instance(
-             recv_req
-         )
-         return success, message
-
-     def send_weights_to_remote_instance(
-         self, recv_req: SendWeightsToRemoteInstanceReqInput
-     ):
-         success, message = self.worker.send_weights_to_remote_instance(recv_req)
-         return success, message
-
-     def update_weights_from_distributed(
-         self, recv_req: UpdateWeightsFromDistributedReqInput
-     ):
-         success, message = self.worker.update_weights_from_distributed(recv_req)
-         return success, message
-
-     def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
-         success, message = self.worker.update_weights_from_tensor(recv_req)
-         return success, message
-
-     def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
-         return self.worker.get_weights_by_name(recv_req)
-
-     def load_lora_adapter(self, recv_req: LoadLoRAAdapterReqInput):
-         return self.worker.load_lora_adapter(recv_req)
-
-     def unload_lora_adapter(self, recv_req: UnloadLoRAAdapterReqInput):
-         return self.worker.unload_lora_adapter(recv_req)
-
-     def can_run_lora_batch(self, lora_ids: list[str]) -> bool:
-         return self.worker.can_run_lora_batch(lora_ids)
-
-     def __delete__(self):
-         self.input_queue.put((None, None))
-         self.copy_queue.put((None, None, None))
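The removed `TpModelWorkerClient` implemented overlap scheduling: the scheduler enqueues a batch and immediately receives placeholder "future" token IDs from `FutureMap`, while a dedicated forward thread resolves those futures, runs the model on its own stream, and publishes finished results through an output queue. A minimal sketch of that two-queue pattern, using only the standard library and illustrative names (no CUDA events, streams, or future maps), might look like:

```python
# Sketch of the two-queue overlap pattern from the removed worker client:
# submit() returns immediately so the caller can prepare the next batch
# while the background thread runs the current one. All names illustrative.
import threading
from queue import Queue


class OverlapWorker:
    def __init__(self, forward_fn):
        self.forward_fn = forward_fn      # stands in for the real model forward
        self.input_queue: Queue = Queue()
        self.output_queue: Queue = Queue()
        threading.Thread(target=self._loop, daemon=True).start()

    def _loop(self):
        while True:
            batch = self.input_queue.get()
            if batch is None:             # shutdown sentinel
                break
            self.output_queue.put(self.forward_fn(batch))

    def submit(self, batch):
        # Non-blocking: the result is resolved later via resolve_last().
        self.input_queue.put(batch)

    def resolve_last(self):
        return self.output_queue.get()    # blocks until the batch finishes


worker = OverlapWorker(lambda xs: [x + 1 for x in xs])
worker.submit([1, 2, 3])                  # overlap: schedule next batch here
print(worker.resolve_last())              # -> [2, 3, 4]
```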
sglang/srt/models/vila.py DELETED
@@ -1,306 +0,0 @@
- import logging
- from typing import Any, Dict, Iterable, List, Optional, Tuple, cast
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from torch import Tensor
- from transformers.configuration_utils import PretrainedConfig
- from transformers.modeling_outputs import BaseModelOutputWithPooling
- from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
- from transformers.models.siglip import SiglipVisionConfig, SiglipVisionModel
-
- import sglang.srt.managers.mm_utils as mm_utils
- import sglang.srt.model_loader.weight_utils as weight_utils
- import sglang.srt.utils as utils
- from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
- from sglang.srt.layers.pooler import Pooler, PoolingType
- from sglang.srt.layers.quantization.base_config import QuantizationConfig
- from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
- from sglang.srt.managers.schedule_batch import (
-     Modality,
-     MultimodalDataItem,
-     MultimodalInputs,
- )
- from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.models.qwen2 import Qwen2ForCausalLM
-
- logger = logging.getLogger(__name__)
-
-
- ##### BEGIN COPY configuration.py #####
-
-
- class VILAConfig(PretrainedConfig):
-     # Class attributes.
-     model_type: str = "vila"
-     sub_configs: Dict[str, PretrainedConfig] = {
-         "text_config": Qwen2Config(),
-         "vision_config": SiglipVisionConfig(),
-     }
-     _auto_class: Optional[str] = "AutoConfig"
-
-     # Configuration for sub-modules.
-     text_config: Qwen2Config = Qwen2Config()
-     vision_config: SiglipVisionConfig = SiglipVisionConfig()
-
-     # Model configuration.
-     hidden_size: int
-     image_token_id: int
-     mm_hidden_size: int
-     mm_projector_type: str
-     mm_vision_select_feature: str
-     mm_vision_select_layer: int
-     video_token_id: int
-
-     def __init__(
-         self,
-         text_config: Optional[Dict[str, Any]] = None,
-         vision_config: Optional[Dict[str, Any]] = None,
-         *,
-         hidden_size: int = 1536,
-         image_token_id: int = 151649,
-         mm_hidden_size: int = 1152,
-         mm_projector_type: str = "mlp_downsample_3x3_fix",
-         mm_vision_select_feature: str = "cls_patch",
-         mm_vision_select_layer: int = -2,
-         video_token_id: int = 151650,
-         **kwargs,
-     ):
-         super().__init__(**kwargs)
-
-         self.text_config = Qwen2Config(**text_config) if text_config else Qwen2Config()
-         self.vision_config = (
-             SiglipVisionConfig(**vision_config)
-             if vision_config
-             else SiglipVisionConfig()
-         )
-
-         self.hidden_size = hidden_size
-         self.image_token_id = image_token_id
-         self.mm_hidden_size = mm_hidden_size
-         self.mm_projector_type = mm_projector_type
-         self.mm_vision_select_feature = mm_vision_select_feature
-         self.mm_vision_select_layer = mm_vision_select_layer
-         self.video_token_id = video_token_id
-
-
- ##### END COPY configuration.py #####
-
- ##### BEGIN COPY modeling_vila.py #####
-
-
- class DownSample3x3BlockFix(nn.Module):
-     def forward(self, x: Tensor) -> Tensor:
-         """
-         Args:
-             x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
-
-         Returns:
-             The output tensor of shape (batch_size, image_pad_len, mm_hidden_size * 9).
-         """
-
-         batch_size, sequence_length, hidden_size = x.shape
-
-         feat_size = int(sequence_length**0.5)
-         if feat_size**2 != sequence_length:
-             raise ValueError(
-                 f"Cannot take square root: sequence_length {sequence_length} is not a perfect square"
-             )
-
-         features = x.reshape(batch_size, feat_size, feat_size, hidden_size)
-
-         pad_after = (3 - feat_size % 3) % 3
-         if pad_after > 0:
-             features = F.pad(features, (0, 0, 0, pad_after, 0, pad_after))
-             feat_size = feat_size + pad_after
-
-         features = features.reshape(
-             batch_size, feat_size // 3, 3, feat_size // 3, 3, hidden_size
-         )
-         features = features.permute(0, 1, 3, 2, 4, 5).contiguous()
-         features = features.reshape(batch_size, -1, 9 * hidden_size)
-
-         return features
-
-
- class MultimodalProjector(nn.Module):
-     layers: nn.Sequential
-
-     def __init__(
-         self,
-         config: VILAConfig,
-         *args,
-         **kwargs,
-     ):
-         super().__init__(*args, **kwargs)
-
-         if config.mm_projector_type == "mlp_downsample_3x3_fix":
-             self.layers = nn.Sequential(
-                 DownSample3x3BlockFix(),
-                 nn.LayerNorm(config.mm_hidden_size * 9),
-                 nn.Linear(
-                     config.mm_hidden_size * 9,
-                     config.mm_hidden_size * 3,
-                 ),
-                 nn.GELU(),
-                 nn.LayerNorm(config.vision_config.hidden_size * 3),
-                 nn.Linear(config.vision_config.hidden_size * 3, config.hidden_size),
-                 nn.GELU(),
-                 nn.Linear(config.hidden_size, config.hidden_size),
-             )
-         else:
-             raise NotImplementedError(
-                 f"Unsupported mm_projector_type: {config.mm_projector_type}"
-             )
-
-         self.layers.type(config.torch_dtype)
-
-     @property
-     def device(self) -> torch.device:
-         return next(self.parameters()).device
-
-     @property
-     def dtype(self) -> torch.dtype:
-         return next(self.parameters()).dtype
-
-     def forward(self, x: Tensor) -> Tensor:
-         """
-         Args:
-             x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
-
-         Returns:
-             The output tensor of shape (batch_size, image_pad_len, hidden_size).
-         """
-
-         return self.layers(x.to(device=self.device, dtype=self.dtype))
-
-
- ##### END COPY modeling_vila.py #####
-
-
- class VILAForConditionalGeneration(nn.Module):
-     config: VILAConfig
-     quant_config: Optional[QuantizationConfig]
-
-     logits_processor: LogitsProcessor
-     pooler: Pooler
-
-     llm: Qwen2ForCausalLM
-     mm_projector: MultimodalProjector
-     vision_tower: SiglipVisionModel
-
-     def __init__(
-         self,
-         config: VILAConfig,
-         quant_config: Optional[QuantizationConfig] = None,
-         prefix: str = "",
-     ) -> None:
-         super().__init__()
-
-         self.config = config
-         self.quant_config = quant_config
-
-         self.logits_processor = LogitsProcessor(config)
-         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
-
-         self.llm = Qwen2ForCausalLM(
-             config=config.text_config,
-             quant_config=quant_config,
-             prefix=utils.add_prefix("llm", prefix),
-         )
-         self.mm_projector = MultimodalProjector(config)
-         self.vision_tower = SiglipVisionModel(config.vision_config)
-
-     @property
-     def dtype(self) -> torch.dtype:
-         return self.config.torch_dtype
-
-     def forward(
-         self,
-         input_ids: Tensor,
-         positions: Tensor,
-         forward_batch: ForwardBatch,
-         get_embedding: bool = False,
-     ) -> LogitsProcessorOutput:
-         output = mm_utils.general_mm_embed_routine(
-             input_ids=input_ids,
-             forward_batch=forward_batch,
-             language_model=self.llm,
-             data_embedding_funcs={
-                 Modality.IMAGE: self.get_image_feature,
-             },
-             get_embedding=get_embedding,
-             positions=positions,
-         )
-
-         return cast(LogitsProcessorOutput, output)
-
-     def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor:
-         pixel_values = cast(Tensor, mm_input[0].feature)
-
-         ##### BEGIN COPY modeling_vila.py #####
-
-         vision_tower_output: BaseModelOutputWithPooling = self.vision_tower.__call__(
-             pixel_values.to(
-                 device=self.vision_tower.device, dtype=self.vision_tower.dtype
-             ),
-             output_hidden_states=True,
-         )
-
-         mm_projector_input = self._vision_tower_output_to_mm_projector_input(
-             vision_tower_output
-         )
-
-         image_embedding: Tensor = self.mm_projector.__call__(
-             mm_projector_input.to(
-                 device=self.mm_projector.device, dtype=self.mm_projector.dtype
-             )
-         )
-
-         ##### END COPY modeling_vila.py #####
-
-         return image_embedding
-
-     def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> None:
-         params_dict = dict(self.named_parameters())
-
-         for name, loaded_weight in weights:
-             if name.startswith("llm."):
-                 self.llm.load_weights([(name[len("llm.") :], loaded_weight)])
-             else:
-                 param = params_dict[name]
-                 weight_loader = getattr(
-                     param, "weight_loader", weight_utils.default_weight_loader
-                 )
-                 weight_loader(param, loaded_weight)
-
-     def pad_input_ids(
-         self, input_ids: List[int], mm_inputs: MultimodalInputs
-     ) -> List[int]:
-         pattern = MultiModalityDataPaddingPatternMultimodalTokens()
-         return pattern.pad_input_tokens(input_ids, mm_inputs)
-
-     ##### BEGIN COPY modeling_vila.py #####
-
-     def _vision_tower_output_to_mm_projector_input(
-         self,
-         vision_tower_output: BaseModelOutputWithPooling,
-     ) -> Tensor:
-         assert vision_tower_output.hidden_states is not None
-
-         selected_layer_hidden_states = vision_tower_output.hidden_states[
-             self.config.mm_vision_select_layer
-         ]
-
-         if self.config.mm_vision_select_feature == "cls_patch":
-             return selected_layer_hidden_states
-         else:
-             raise NotImplementedError(
-                 f"Unsupported mm_vision_select_feature: {self.config.mm_vision_select_feature}"
-             )
-
-     ##### END COPY modeling_vila.py #####
-
-
- EntryClass = [VILAForConditionalGeneration]
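For reference, the `mlp_downsample_3x3_fix` projector above hinges on `DownSample3x3BlockFix`, which folds a square grid of vision tokens into 3x3 spatial patches and concatenates the 9 neighboring features channel-wise, shrinking the sequence roughly 9x before the MLP. A standalone PyTorch sketch of that reshape follows; the shapes are illustrative (e.g. a 27x27 SigLIP token grid with hidden size 1152, matching the config defaults above).

```python
# Shape check for the 3x3 downsample used by the removed VILA projector:
# a (B, S, H) token grid with S a perfect square is folded into 3x3 spatial
# patches, stacking the 9 neighbors on the channel dimension.
import torch
import torch.nn.functional as F


def downsample_3x3(x: torch.Tensor) -> torch.Tensor:
    b, s, h = x.shape
    f = int(s ** 0.5)
    assert f * f == s, "sequence length must be a perfect square"
    feats = x.reshape(b, f, f, h)
    pad = (3 - f % 3) % 3                      # pad grid up to a multiple of 3
    if pad:
        feats = F.pad(feats, (0, 0, 0, pad, 0, pad))
        f += pad
    feats = feats.reshape(b, f // 3, 3, f // 3, 3, h)
    feats = feats.permute(0, 1, 3, 2, 4, 5).contiguous()
    return feats.reshape(b, -1, 9 * h)         # 9 neighbors stacked on channels


x = torch.randn(2, 27 * 27, 1152)              # e.g. a 27x27 vision token grid
print(downsample_3x3(x).shape)                 # -> torch.Size([2, 81, 10368])
```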