sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +378 -160
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +10 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +136 -25
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +63 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +83 -80
- sglang/srt/entrypoints/grpc_server.py +430 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +195 -102
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +58 -6
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +33 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +20 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/minimax_m2.py +367 -0
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +10 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +24 -10
- sglang/srt/layers/attention/flashinfer_backend.py +258 -22
- sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
- sglang/srt/layers/attention/utils.py +89 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +12 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +64 -19
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +152 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
- sglang/srt/layers/moe/ep_moe/layer.py +154 -625
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
- sglang/srt/layers/moe/moe_runner/runner.py +6 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
- sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
- sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +7 -6
- sglang/srt/layers/moe/utils.py +20 -5
- sglang/srt/layers/quantization/__init__.py +5 -58
- sglang/srt/layers/quantization/awq.py +183 -9
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +27 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +152 -81
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gguf.py +566 -0
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +35 -68
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +23 -48
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +87 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +62 -9
- sglang/srt/layers/rotary_embedding.py +686 -17
- sglang/srt/layers/sampler.py +47 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +69 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +420 -514
- sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +375 -95
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +11 -2
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +517 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +71 -25
- sglang/srt/model_executor/model_runner.py +362 -270
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +418 -140
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +327 -382
- sglang/srt/models/glm4_moe_nextn.py +6 -16
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +32 -199
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/minimax_m2.py +922 -0
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/nvila.py +355 -0
- sglang/srt/models/nvila_lite.py +184 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2.py +22 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3.py +34 -4
- sglang/srt/models/qwen3_moe.py +19 -37
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +7 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +2 -6
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +28 -2
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +846 -163
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +36 -31
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +272 -82
- sglang/srt/utils/hf_transformers_utils.py +44 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +463 -107
- sglang/test/test_deterministic_utils.py +74 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/models/vila.py +0 -306
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# Copyright 2025 SGLang Team
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
"""Common config utils for mamba2 - NemotronH, FalconH1, Qwen3Next, etc."""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import torch
|
|
20
|
+
|
|
21
|
+
from sglang.srt.distributed.utils import divide
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def extra_groups_for_head_shards(ngroups: int, tp_size: int) -> int:
    """Compute how many extra groups are needed to accompany the head shards.

    Args:
        ngroups: Number of groups configured for the model.
        tp_size: Tensor-parallel world size.

    Returns:
        ``0`` when ``ngroups`` is evenly divisible by ``tp_size``; otherwise
        ``tp_size - ngroups``.
    """
    # When ngroups % tp_size == 0 the groups shard evenly, so no extra
    # (replicated) groups are required.
    if ngroups % tp_size == 0:
        return 0

    # For ngroups == 1 this is exactly tp_size - ngroups.
    # NOTE(review): when ngroups > tp_size and is not divisible, this value is
    # negative — confirm that callers never reach that configuration.
    return tp_size - ngroups
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(kw_only=True, frozen=True)
|
|
37
|
+
class Mamba2StateShape:
|
|
38
|
+
conv: tuple[int, int]
|
|
39
|
+
temporal: tuple[int, int, int]
|
|
40
|
+
|
|
41
|
+
intermediate_size: int
|
|
42
|
+
conv_dim: int
|
|
43
|
+
ssm_state_size: int
|
|
44
|
+
num_heads: int
|
|
45
|
+
head_dim: int
|
|
46
|
+
state_size: int
|
|
47
|
+
conv_kernel: int
|
|
48
|
+
|
|
49
|
+
@staticmethod
|
|
50
|
+
def create(
|
|
51
|
+
*,
|
|
52
|
+
tp_world_size: int,
|
|
53
|
+
intermediate_size: int,
|
|
54
|
+
n_groups: int,
|
|
55
|
+
num_heads: int,
|
|
56
|
+
head_dim: int,
|
|
57
|
+
state_size: int,
|
|
58
|
+
conv_kernel: int,
|
|
59
|
+
) -> "Mamba2StateShape":
|
|
60
|
+
# if n_groups is not divisible by world_size, need to extend the shards
|
|
61
|
+
# to ensure all groups needed by a head is sharded along with it
|
|
62
|
+
if n_groups % tp_world_size != 0:
|
|
63
|
+
extra_groups = extra_groups_for_head_shards(n_groups, tp_world_size)
|
|
64
|
+
n_groups += extra_groups
|
|
65
|
+
# heads and n_groups are TP-ed
|
|
66
|
+
conv_dim = intermediate_size + 2 * n_groups * state_size
|
|
67
|
+
|
|
68
|
+
# contiguous along 'dim' axis
|
|
69
|
+
conv_state_shape = divide(conv_dim, tp_world_size), conv_kernel - 1
|
|
70
|
+
|
|
71
|
+
# These are not TP-ed as they depend on A, dt_bias, D
|
|
72
|
+
# - they are typically small
|
|
73
|
+
# e.g., QWen3-Next: (32, 128, 128)
|
|
74
|
+
temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
|
|
75
|
+
return Mamba2StateShape(
|
|
76
|
+
conv=conv_state_shape,
|
|
77
|
+
temporal=temporal_state_shape,
|
|
78
|
+
intermediate_size=intermediate_size,
|
|
79
|
+
conv_dim=conv_dim,
|
|
80
|
+
ssm_state_size=state_size,
|
|
81
|
+
num_heads=num_heads,
|
|
82
|
+
head_dim=head_dim,
|
|
83
|
+
state_size=state_size,
|
|
84
|
+
conv_kernel=conv_kernel,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass(kw_only=True, frozen=True)
|
|
89
|
+
class Mamba2StateDType:
|
|
90
|
+
conv: torch.dtype
|
|
91
|
+
temporal: torch.dtype
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
CONV_DTYPE = torch.bfloat16
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def mamba2_state_dtype() -> Mamba2StateDType:
|
|
98
|
+
dtype_map = {
|
|
99
|
+
"float32": torch.float32,
|
|
100
|
+
"bfloat16": torch.bfloat16,
|
|
101
|
+
}
|
|
102
|
+
ssm_dtype = dtype_map[os.environ["SGLANG_MAMBA_SSM_DTYPE"]]
|
|
103
|
+
return Mamba2StateDType(conv=CONV_DTYPE, temporal=ssm_dtype)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass(kw_only=True, frozen=True)
class Mamba2CacheParams:
    """Immutable bundle of mamba2 cache shapes, dtypes and layer ids."""

    # Shapes of the conv and temporal states.
    shape: Mamba2StateShape
    # Defaults to whatever `mamba2_state_dtype()` returns at construction time.
    dtype: Mamba2StateDType = field(default_factory=mamba2_state_dtype)
    # A field without a default may follow a defaulted one only because the
    # dataclass is declared with kw_only=True.
    layers: list[int]

    @property
    def mamba_cache_per_req(self) -> int:
        """Return the cache size of one request, summed over ``layers``.

        Computed as (conv elements * conv itemsize + temporal elements *
        temporal itemsize) * number of layers. The unit is whatever
        ``torch.dtype.itemsize`` reports (bytes per element).
        """
        conv_size = int(np.prod(self.shape.conv)) * self.dtype.conv.itemsize
        temporal_size = (
            int(np.prod(self.shape.temporal)) * self.dtype.temporal.itemsize
        )
        return (conv_size + temporal_size) * len(self.layers)
|
|
@@ -17,7 +17,7 @@ import logging
|
|
|
17
17
|
import math
|
|
18
18
|
import os
|
|
19
19
|
from enum import Enum, IntEnum, auto
|
|
20
|
-
from typing import List, Optional, Set, Union
|
|
20
|
+
from typing import Any, List, Optional, Set, Union
|
|
21
21
|
|
|
22
22
|
import torch
|
|
23
23
|
from transformers import PretrainedConfig
|
|
@@ -53,7 +53,11 @@ def is_deepseek_nsa(config: PretrainedConfig) -> bool:
|
|
|
53
53
|
return (
|
|
54
54
|
config.architectures is not None
|
|
55
55
|
and config.architectures[0]
|
|
56
|
-
in [
|
|
56
|
+
in [
|
|
57
|
+
"DeepseekV3ForCausalLM",
|
|
58
|
+
"DeepseekV32ForCausalLM",
|
|
59
|
+
"DeepseekV3ForCausalLMNextN",
|
|
60
|
+
]
|
|
57
61
|
and getattr(config, "index_topk", None) is not None
|
|
58
62
|
)
|
|
59
63
|
|
|
@@ -87,8 +91,12 @@ class ModelConfig:
|
|
|
87
91
|
quantization: Optional[str] = None,
|
|
88
92
|
override_config_file: Optional[str] = None,
|
|
89
93
|
is_draft_model: bool = False,
|
|
90
|
-
hybrid_kvcache_ratio: Optional[
|
|
94
|
+
hybrid_kvcache_ratio: Optional[
|
|
95
|
+
float
|
|
96
|
+
] = None, # TODO: remove this, it is not a model config
|
|
91
97
|
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
|
|
98
|
+
sampling_defaults: str = "openai",
|
|
99
|
+
quantize_and_serve: bool = False,
|
|
92
100
|
) -> None:
|
|
93
101
|
# Parse args
|
|
94
102
|
self.model_path = model_path
|
|
@@ -96,6 +104,11 @@ class ModelConfig:
|
|
|
96
104
|
self.quantization = quantization
|
|
97
105
|
self.is_draft_model = is_draft_model
|
|
98
106
|
self.model_impl = model_impl
|
|
107
|
+
self.sampling_defaults = sampling_defaults
|
|
108
|
+
self.quantize_and_serve = quantize_and_serve
|
|
109
|
+
|
|
110
|
+
# Validate quantize_and_serve configuration
|
|
111
|
+
self._validate_quantize_and_serve_config()
|
|
99
112
|
|
|
100
113
|
# Get hf config
|
|
101
114
|
self._maybe_pull_model_tokenizer_from_remote()
|
|
@@ -211,6 +224,8 @@ class ModelConfig:
|
|
|
211
224
|
quantization=server_args.quantization,
|
|
212
225
|
hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
|
|
213
226
|
model_impl=server_args.model_impl,
|
|
227
|
+
sampling_defaults=server_args.sampling_defaults,
|
|
228
|
+
quantize_and_serve=server_args.quantize_and_serve,
|
|
214
229
|
**kwargs,
|
|
215
230
|
)
|
|
216
231
|
|
|
@@ -477,31 +492,32 @@ class ModelConfig:
|
|
|
477
492
|
# example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
|
|
478
493
|
# example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
|
|
479
494
|
is_local = os.path.exists(self.model_path)
|
|
480
|
-
modelopt_quant_config = {"quant_method": "modelopt"}
|
|
481
495
|
if not is_local:
|
|
482
496
|
import huggingface_hub
|
|
483
497
|
|
|
484
498
|
try:
|
|
485
|
-
from huggingface_hub import HfApi
|
|
499
|
+
from huggingface_hub import HfApi, hf_hub_download
|
|
486
500
|
|
|
487
501
|
hf_api = HfApi()
|
|
488
|
-
|
|
489
|
-
def check_hf_quant_config():
|
|
490
|
-
return hf_api.file_exists(
|
|
491
|
-
self.model_path, "hf_quant_config.json"
|
|
492
|
-
)
|
|
493
|
-
|
|
494
502
|
# Retry HF API call up to 3 times
|
|
495
503
|
file_exists = retry(
|
|
496
|
-
|
|
504
|
+
lambda: hf_api.file_exists(
|
|
505
|
+
self.model_path, "hf_quant_config.json"
|
|
506
|
+
),
|
|
497
507
|
max_retry=2,
|
|
498
508
|
initial_delay=1.0,
|
|
499
509
|
max_delay=5.0,
|
|
500
510
|
)
|
|
501
|
-
|
|
502
511
|
if file_exists:
|
|
503
|
-
|
|
504
|
-
|
|
512
|
+
# Download and parse the quantization config for remote models
|
|
513
|
+
quant_config_file = hf_hub_download(
|
|
514
|
+
repo_id=self.model_path,
|
|
515
|
+
filename="hf_quant_config.json",
|
|
516
|
+
revision=self.revision,
|
|
517
|
+
)
|
|
518
|
+
with open(quant_config_file) as f:
|
|
519
|
+
quant_config_dict = json.load(f)
|
|
520
|
+
quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
|
|
505
521
|
except huggingface_hub.errors.OfflineModeIsEnabled:
|
|
506
522
|
logger.warning(
|
|
507
523
|
"Offline mode is enabled, skipping hf_quant_config.json check"
|
|
@@ -510,21 +526,79 @@ class ModelConfig:
|
|
|
510
526
|
logger.warning(
|
|
511
527
|
f"Failed to check hf_quant_config.json: {self.model_path} {e}"
|
|
512
528
|
)
|
|
513
|
-
|
|
514
529
|
elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
|
|
515
530
|
quant_config_file = os.path.join(
|
|
516
531
|
self.model_path, "hf_quant_config.json"
|
|
517
532
|
)
|
|
518
533
|
with open(quant_config_file) as f:
|
|
519
534
|
quant_config_dict = json.load(f)
|
|
520
|
-
|
|
521
|
-
quant_algo = json_quant_configs.get("quant_algo", None)
|
|
522
|
-
if quant_algo == "MIXED_PRECISION":
|
|
523
|
-
quant_cfg = {"quant_method": "w4afp8"}
|
|
524
|
-
else:
|
|
525
|
-
quant_cfg = modelopt_quant_config
|
|
535
|
+
quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
|
|
526
536
|
return quant_cfg
|
|
527
537
|
|
|
538
|
+
def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
|
|
539
|
+
"""Parse ModelOpt quantization config and return the appropriate quant_method."""
|
|
540
|
+
json_quant_configs = quant_config_dict["quantization"]
|
|
541
|
+
quant_algo = json_quant_configs.get("quant_algo", None)
|
|
542
|
+
|
|
543
|
+
if quant_algo == "MIXED_PRECISION":
|
|
544
|
+
return {"quant_method": "w4afp8"}
|
|
545
|
+
elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
|
|
546
|
+
return {"quant_method": "modelopt_fp4"}
|
|
547
|
+
elif quant_algo and "FP8" in quant_algo:
|
|
548
|
+
return {"quant_method": "modelopt_fp8"}
|
|
549
|
+
else:
|
|
550
|
+
return None
|
|
551
|
+
|
|
552
|
+
def _is_already_quantized(self) -> bool:
    """Return whether ``self.model_path`` already has a HuggingFace quant config.

    The decision is delegated entirely to ``has_hf_quant_config``.
    """
    # Function-scope import, as in the original code
    # (presumably to avoid an import cycle -- TODO confirm).
    from sglang.srt.utils import has_hf_quant_config

    already_quantized = has_hf_quant_config(self.model_path)
    return already_quantized
|
|
558
|
+
|
|
559
|
+
def _get_modelopt_quant_type(self) -> str:
    """Derive the ModelOpt quantization type from ``self.quantization``.

    Returns:
        ``"nvfp4"`` when FP4 is requested explicitly (``"modelopt_fp4"``) or
        auto-detected under ``"modelopt"`` from the result of
        ``self._parse_quant_hf_config()``; ``"fp8"`` in every other case,
        including when auto-detection finds nothing.
    """
    if self.quantization == "modelopt_fp4":
        return "nvfp4"

    if self.quantization == "modelopt":
        # Auto-detect from the model's own quantization config.
        quant_cfg = self._parse_quant_hf_config()
        if quant_cfg:
            # ``or ""`` guards against an explicit ``"quant_method": None``,
            # which would otherwise fail on ``.lower()``.
            quant_method = (quant_cfg.get("quant_method") or "").lower()
            if "fp4" in quant_method:
                # Use the same identifier as the explicit ``modelopt_fp4``
                # branch so both paths agree on how FP4 is named.
                return "nvfp4"

    # "modelopt_fp8", detected fp8, undetectable configs and any other value
    # all resolve to fp8.
    return "fp8"
|
|
578
|
+
|
|
579
|
+
def _validate_quantize_and_serve_config(self):
    """Reject every configuration that turns on ``quantize_and_serve``.

    Returns without doing anything when ``self.quantize_and_serve`` is falsy.

    Raises:
        ValueError: The flag is set but ``self.quantization`` is not one of
            ``"modelopt"``, ``"modelopt_fp8"`` or ``"modelopt_fp4"``.
        NotImplementedError: The flag is set together with a ModelOpt
            quantization; the feature is disabled, so this always fires.
    """
    if self.quantize_and_serve:
        modelopt_methods = ("modelopt", "modelopt_fp8", "modelopt_fp4")
        if self.quantization not in modelopt_methods:
            raise ValueError("quantize_and_serve requires ModelOpt quantization")

        # Even a valid ModelOpt setting is refused: the feature is switched off.
        disabled_message = (
            "quantize_and_serve functionality is currently disabled due to compatibility issues. "
            "Please use the separate quantize-then-deploy workflow instead. "
            "Step 1: Quantize and export model. "
            "Step 2: Deploy the exported model."
        )
        raise NotImplementedError(disabled_message)
|
|
601
|
+
|
|
528
602
|
# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
|
|
529
603
|
def _verify_quantization(self) -> None:
|
|
530
604
|
supported_quantization = [*QUANTIZATION_METHODS]
|
|
@@ -543,7 +617,8 @@ class ModelConfig:
|
|
|
543
617
|
optimized_quantization_methods = [
|
|
544
618
|
"fp8",
|
|
545
619
|
"marlin",
|
|
546
|
-
"
|
|
620
|
+
"modelopt_fp8",
|
|
621
|
+
"modelopt_fp4",
|
|
547
622
|
"gptq_marlin_24",
|
|
548
623
|
"gptq_marlin",
|
|
549
624
|
"awq_marlin",
|
|
@@ -657,6 +732,38 @@ class ModelConfig:
|
|
|
657
732
|
eos_ids = eos_ids | generation_eos_ids
|
|
658
733
|
return eos_ids
|
|
659
734
|
|
|
735
|
+
def get_default_sampling_params(self) -> dict[str, Any]:
    """Collect sampling defaults from the model's generation config.

    Only consulted when ``self.sampling_defaults`` is ``"model"``; any other
    setting, or a missing ``self.hf_generation_config``, yields an empty dict.

    Returns:
        A dict holding those of ``repetition_penalty``, ``temperature``,
        ``top_k``, ``top_p`` and ``min_p`` that the generation config sets to
        a value other than ``None``.
    """
    if self.sampling_defaults != "model":
        return {}

    generation_config = self.hf_generation_config
    if generation_config is None:
        return {}

    raw_values = generation_config.to_dict()

    picked = {}
    for key in ("repetition_penalty", "temperature", "top_k", "top_p", "min_p"):
        value = raw_values.get(key)
        # Unset (None) entries are left out rather than forwarded.
        if value is not None:
            picked[key] = value
    return picked
|
|
766
|
+
|
|
660
767
|
def _maybe_pull_model_tokenizer_from_remote(self) -> None:
|
|
661
768
|
"""
|
|
662
769
|
Pull the model config files to a temporary
|
|
@@ -698,7 +805,7 @@ def _get_and_verify_dtype(
|
|
|
698
805
|
) -> torch.dtype:
|
|
699
806
|
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
|
|
700
807
|
# because config.torch_dtype can be None.
|
|
701
|
-
config_dtype = getattr(config, "
|
|
808
|
+
config_dtype = getattr(config, "dtype", None)
|
|
702
809
|
if isinstance(config_dtype, str):
|
|
703
810
|
config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
|
|
704
811
|
if config_dtype is None:
|
|
@@ -802,15 +909,19 @@ multimodal_model_archs = [
|
|
|
802
909
|
"Qwen2_5_VLForConditionalGeneration",
|
|
803
910
|
"Qwen3VLForConditionalGeneration",
|
|
804
911
|
"Qwen3VLMoeForConditionalGeneration",
|
|
912
|
+
"Qwen3OmniMoeForConditionalGeneration",
|
|
805
913
|
"KimiVLForConditionalGeneration",
|
|
806
914
|
"InternVLChatModel",
|
|
807
915
|
"InternS1ForConditionalGeneration",
|
|
808
916
|
"Phi4MMForCausalLM",
|
|
809
|
-
"VILAForConditionalGeneration",
|
|
810
917
|
"Step3VLForConditionalGeneration",
|
|
918
|
+
"POINTSV15ChatModel",
|
|
811
919
|
"DotsVLMForCausalLM",
|
|
812
920
|
"DotsOCRForCausalLM",
|
|
813
921
|
"Sarashina2VisionForCausalLM",
|
|
922
|
+
"NVILAForConditionalGeneration",
|
|
923
|
+
"NVILALiteForConditionalGeneration",
|
|
924
|
+
"DeepseekOCRForCausalLM",
|
|
814
925
|
]
|
|
815
926
|
|
|
816
927
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Configuration for NVIDIA ModelOpt quantization integration
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class ModelOptConfig:
    """Settings bundle for NVIDIA ModelOpt quantization.

    Groups the options that drive ModelOpt quantization, checkpoint
    handling, and export of the resulting model. Every field is optional
    and defaults to "not set".

    Attributes:
        quant: Quantization method/type (e.g., "fp8", "fp4").
        checkpoint_restore_path: Where a ModelOpt checkpoint is restored from.
        checkpoint_save_path: Where a ModelOpt checkpoint is saved to.
        export_path: Where the quantized model is exported in HuggingFace
            format.
        quantize_and_serve: Whether quantization and serving happen in a
            single step.
    """

    quant: Optional[str] = None
    checkpoint_restore_path: Optional[str] = None
    checkpoint_save_path: Optional[str] = None
    export_path: Optional[str] = None
    quantize_and_serve: bool = False

    def __post_init__(self):
        """Validation hook run after construction; performs no checks yet."""
        # Placeholder: add validation logic here when it is needed.
        return None
|