sglang 0.4.5.post3__tar.gz → 0.4.6.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.4.5.post3/sglang.egg-info → sglang-0.4.6.post1}/PKG-INFO +5 -6
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/README.md +2 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/pyproject.toml +3 -4
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_one_batch.py +19 -3
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_serving.py +8 -9
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/compile_deep_gemm.py +45 -4
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/code_completion_parser.py +1 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/deepseekvl2.py +1 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/model_config.py +9 -3
- sglang-0.4.6.post1/sglang/srt/constrained/llguidance_backend.py +169 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/conversation.py +34 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/decode.py +67 -13
- sglang-0.4.6.post1/sglang/srt/disaggregation/fake/__init__.py +1 -0
- sglang-0.4.6.post1/sglang/srt/disaggregation/fake/conn.py +88 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mini_lb.py +45 -8
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/conn.py +198 -31
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/prefill.py +36 -12
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/utils.py +16 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/engine.py +9 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/http_server.py +35 -4
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/function_call_parser.py +77 -5
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/base_attn_backend.py +3 -0
- sglang-0.4.6.post1/sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashattention_backend.py +28 -10
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashmla_backend.py +8 -11
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/utils.py +1 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/vision.py +2 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/layernorm.py +38 -16
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/logits_processor.py +2 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_native.py +2 -4
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- sglang-0.4.5.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +35 -35
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/pooler.py +6 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/awq.py +5 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/deep_gemm.py +17 -10
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8.py +20 -22
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8_utils.py +2 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/int8_kernel.py +32 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/radix_attention.py +13 -3
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/rotary_embedding.py +170 -126
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/data_parallel_controller.py +10 -3
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/io_struct.py +7 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/mm_utils.py +85 -28
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/schedule_batch.py +38 -12
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/scheduler.py +41 -28
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/tokenizer_manager.py +5 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/tp_worker.py +3 -3
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/memory_pool.py +87 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_executor/cuda_graph_runner.py +4 -3
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_executor/forward_batch_info.py +51 -95
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_executor/model_runner.py +19 -25
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek.py +12 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_nextn.py +101 -6
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_v2.py +144 -70
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_vl2.py +9 -4
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma3_causal.py +1 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama4.py +0 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpmo.py +5 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mllama4.py +2 -2
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_5_vl.py +3 -6
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_vl.py +3 -7
- sglang-0.4.6.post1/sglang/srt/models/roberta.py +178 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/openai_api/adapter.py +50 -11
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/openai_api/protocol.py +2 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/reasoning_parser.py +25 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/server_args.py +31 -24
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/torch_memory_saver_adapter.py +10 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/utils.py +5 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/runners.py +6 -13
- sglang-0.4.6.post1/sglang/test/send_one.py +144 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_utils.py +74 -18
- sglang-0.4.6.post1/sglang/version.py +1 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1/sglang.egg-info}/PKG-INFO +5 -6
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/SOURCES.txt +17 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/requires.txt +2 -2
- sglang-0.4.5.post3/sglang/srt/constrained/llguidance_backend.py +0 -152
- sglang-0.4.5.post3/sglang/test/send_one.py +0 -88
- sglang-0.4.5.post3/sglang/version.py +0 -1
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/LICENSE +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/setup.cfg +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/api.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/check_env.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/global_config.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/launch_server.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/janus_pro.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/base_connector.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/redis.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/serde/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/custom_op.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/base/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/base/conn.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/nixl/conn.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/EngineBase.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/http_server_engine.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/verl_engine.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/elementwise.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/router.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/topk.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/kv_cache.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/cache_controller.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/expert_distribution.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/mllama4.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/paged_allocator.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/bert.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/clip.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma3_mm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/grok.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_eagle3.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen3.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen3_moe.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/patch_torch.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/platforms/interface.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_worker.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/warmup.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/__init__.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/test_flashattn_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_block_fp8_ep.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_custom_ops.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/utils.py +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.4.5.post3
|
3
|
+
Version: 0.4.6.post1
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -225,7 +225,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
|
|
225
225
|
Requires-Dist: hf_transfer; extra == "runtime-common"
|
226
226
|
Requires-Dist: huggingface_hub; extra == "runtime-common"
|
227
227
|
Requires-Dist: interegular; extra == "runtime-common"
|
228
|
-
Requires-Dist: llguidance
|
228
|
+
Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
|
229
229
|
Requires-Dist: modelscope; extra == "runtime-common"
|
230
230
|
Requires-Dist: ninja; extra == "runtime-common"
|
231
231
|
Requires-Dist: orjson; extra == "runtime-common"
|
@@ -242,11 +242,10 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
|
|
242
242
|
Requires-Dist: transformers==4.51.1; extra == "runtime-common"
|
243
243
|
Requires-Dist: uvicorn; extra == "runtime-common"
|
244
244
|
Requires-Dist: uvloop; extra == "runtime-common"
|
245
|
-
Requires-Dist: compressed-tensors; extra == "runtime-common"
|
246
245
|
Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
|
247
246
|
Provides-Extra: srt
|
248
247
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
249
|
-
Requires-Dist: sgl-kernel==0.0
|
248
|
+
Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
|
250
249
|
Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
|
251
250
|
Requires-Dist: torch==2.6.0; extra == "srt"
|
252
251
|
Requires-Dist: torchvision==0.21.0; extra == "srt"
|
@@ -409,5 +408,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
|
|
409
408
|
|
410
409
|
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
|
411
410
|
|
412
|
-
## Acknowledgment
|
413
|
-
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
411
|
+
## Acknowledgment
|
412
|
+
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
|
|
71
71
|
|
72
72
|
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
|
73
73
|
|
74
|
-
## Acknowledgment
|
75
|
-
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
74
|
+
## Acknowledgment
|
75
|
+
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "sglang"
|
7
|
-
version = "0.4.5.post3"
|
7
|
+
version = "0.4.6.post1"
|
8
8
|
description = "SGLang is yet another fast serving framework for large language models and vision language models."
|
9
9
|
readme = "README.md"
|
10
10
|
requires-python = ">=3.8"
|
@@ -24,7 +24,7 @@ runtime_common = [
|
|
24
24
|
"hf_transfer",
|
25
25
|
"huggingface_hub",
|
26
26
|
"interegular",
|
27
|
-
"llguidance>=0.
|
27
|
+
"llguidance>=0.7.11,<0.8.0",
|
28
28
|
"modelscope",
|
29
29
|
"ninja",
|
30
30
|
"orjson",
|
@@ -41,13 +41,12 @@ runtime_common = [
|
|
41
41
|
"transformers==4.51.1",
|
42
42
|
"uvicorn",
|
43
43
|
"uvloop",
|
44
|
-
"compressed-tensors",
|
45
44
|
"xgrammar==0.1.17",
|
46
45
|
]
|
47
46
|
|
48
47
|
srt = [
|
49
48
|
"sglang[runtime_common]",
|
50
|
-
"sgl-kernel==0.0
|
49
|
+
"sgl-kernel==0.1.0",
|
51
50
|
"flashinfer_python==0.2.3",
|
52
51
|
"torch==2.6.0",
|
53
52
|
"torchvision==0.21.0",
|
@@ -57,6 +57,7 @@ import torch
|
|
57
57
|
import torch.distributed as dist
|
58
58
|
|
59
59
|
from sglang.srt.configs.model_config import ModelConfig
|
60
|
+
from sglang.srt.distributed.parallel_state import destroy_distributed_environment
|
60
61
|
from sglang.srt.entrypoints.engine import _set_envs_and_config
|
61
62
|
from sglang.srt.hf_transformers_utils import get_tokenizer
|
62
63
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
@@ -85,6 +86,7 @@ class BenchArgs:
|
|
85
86
|
correctness_test: bool = False
|
86
87
|
# This is only used for correctness test
|
87
88
|
cut_len: int = 4
|
89
|
+
log_decode_step: int = 0
|
88
90
|
profile: bool = False
|
89
91
|
profile_filename_prefix: str = "profile"
|
90
92
|
|
@@ -105,6 +107,12 @@ class BenchArgs:
|
|
105
107
|
)
|
106
108
|
parser.add_argument("--correctness-test", action="store_true")
|
107
109
|
parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
|
110
|
+
parser.add_argument(
|
111
|
+
"--log-decode-step",
|
112
|
+
type=int,
|
113
|
+
default=BenchArgs.log_decode_step,
|
114
|
+
help="Log decode latency by step, default is set to zero to disable.",
|
115
|
+
)
|
108
116
|
parser.add_argument(
|
109
117
|
"--profile", action="store_true", help="Use Torch Profiler."
|
110
118
|
)
|
@@ -335,6 +343,7 @@ def latency_test_run_once(
|
|
335
343
|
input_len,
|
336
344
|
output_len,
|
337
345
|
device,
|
346
|
+
log_decode_step,
|
338
347
|
profile,
|
339
348
|
profile_filename_prefix,
|
340
349
|
):
|
@@ -394,9 +403,9 @@ def latency_test_run_once(
|
|
394
403
|
tot_latency += latency
|
395
404
|
throughput = batch_size / latency
|
396
405
|
decode_latencies.append(latency)
|
397
|
-
if i < 5:
|
406
|
+
if i < 5 or (log_decode_step > 0 and i % log_decode_step == 0):
|
398
407
|
rank_print(
|
399
|
-
f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
|
408
|
+
f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
|
400
409
|
)
|
401
410
|
|
402
411
|
if profile:
|
@@ -457,8 +466,9 @@ def latency_test(
|
|
457
466
|
reqs,
|
458
467
|
bench_args.batch_size[0],
|
459
468
|
bench_args.input_len[0],
|
460
|
-
|
469
|
+
min(32, bench_args.output_len[0]), # shorter decoding to speed up the warmup
|
461
470
|
server_args.device,
|
471
|
+
log_decode_step=0,
|
462
472
|
profile=False,
|
463
473
|
profile_filename_prefix="", # not used
|
464
474
|
)
|
@@ -480,6 +490,7 @@ def latency_test(
|
|
480
490
|
il,
|
481
491
|
ol,
|
482
492
|
server_args.device,
|
493
|
+
bench_args.log_decode_step,
|
483
494
|
bench_args.profile if tp_rank == 0 else None,
|
484
495
|
bench_args.profile_filename_prefix,
|
485
496
|
)
|
@@ -492,8 +503,13 @@ def latency_test(
|
|
492
503
|
for result in result_list:
|
493
504
|
fout.write(json.dumps(result) + "\n")
|
494
505
|
|
506
|
+
if server_args.tp_size > 1:
|
507
|
+
destroy_distributed_environment()
|
508
|
+
|
495
509
|
|
496
510
|
def main(server_args, bench_args):
|
511
|
+
server_args.cuda_graph_max_bs = max(bench_args.batch_size)
|
512
|
+
|
497
513
|
_set_envs_and_config(server_args)
|
498
514
|
|
499
515
|
if server_args.model_path:
|
@@ -295,7 +295,7 @@ async def async_request_truss(
|
|
295
295
|
# NOTE: Some completion API might have a last
|
296
296
|
# usage summary response without a token so we
|
297
297
|
# want to check a token was generated
|
298
|
-
if data["choices"][0]["
|
298
|
+
if data["choices"][0]["text"]:
|
299
299
|
timestamp = time.perf_counter()
|
300
300
|
# First token
|
301
301
|
if ttft == 0.0:
|
@@ -307,7 +307,7 @@ async def async_request_truss(
|
|
307
307
|
output.itl.append(timestamp - most_recent_timestamp)
|
308
308
|
|
309
309
|
most_recent_timestamp = timestamp
|
310
|
-
generated_text += data["choices"][0]["
|
310
|
+
generated_text += data["choices"][0]["text"]
|
311
311
|
|
312
312
|
output.generated_text = generated_text
|
313
313
|
output.success = True
|
@@ -977,6 +977,7 @@ async def benchmark(
|
|
977
977
|
profile: bool,
|
978
978
|
pd_seperated: bool = False,
|
979
979
|
flush_cache: bool = False,
|
980
|
+
warmup_requests: int = 1,
|
980
981
|
):
|
981
982
|
if backend in ASYNC_REQUEST_FUNCS:
|
982
983
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
@@ -994,11 +995,11 @@ async def benchmark(
|
|
994
995
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
995
996
|
|
996
997
|
# Warmup
|
997
|
-
print(f"Starting warmup with {
|
998
|
+
print(f"Starting warmup with {warmup_requests} sequences...")
|
998
999
|
|
999
1000
|
# Use the first request for all warmup iterations
|
1000
1001
|
test_prompt, test_prompt_len, test_output_len = input_requests[0]
|
1001
|
-
if lora_names
|
1002
|
+
if lora_names is not None and len(lora_names) != 0:
|
1002
1003
|
lora_name = lora_names[0]
|
1003
1004
|
else:
|
1004
1005
|
lora_name = None
|
@@ -1016,7 +1017,7 @@ async def benchmark(
|
|
1016
1017
|
|
1017
1018
|
# Run warmup requests
|
1018
1019
|
warmup_tasks = []
|
1019
|
-
for _ in range(
|
1020
|
+
for _ in range(warmup_requests):
|
1020
1021
|
warmup_tasks.append(
|
1021
1022
|
asyncio.create_task(request_func(request_func_input=test_input))
|
1022
1023
|
)
|
@@ -1024,9 +1025,7 @@ async def benchmark(
|
|
1024
1025
|
warmup_outputs = await asyncio.gather(*warmup_tasks)
|
1025
1026
|
|
1026
1027
|
# Check if at least one warmup request succeeded
|
1027
|
-
if
|
1028
|
-
output.success for output in warmup_outputs
|
1029
|
-
):
|
1028
|
+
if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
|
1030
1029
|
raise ValueError(
|
1031
1030
|
"Warmup failed - Please make sure benchmark arguments "
|
1032
1031
|
f"are correctly specified. Error: {warmup_outputs[0].error}"
|
@@ -1058,7 +1057,7 @@ async def benchmark(
|
|
1058
1057
|
tasks: List[asyncio.Task] = []
|
1059
1058
|
async for request in get_request(input_requests, request_rate):
|
1060
1059
|
prompt, prompt_len, output_len = request
|
1061
|
-
if lora_names
|
1060
|
+
if lora_names is not None and len(lora_names) != 0:
|
1062
1061
|
idx = random.randint(0, len(lora_names) - 1)
|
1063
1062
|
lora_name = lora_names[idx]
|
1064
1063
|
else:
|
@@ -27,7 +27,11 @@ from sglang.srt.warmup import warmup
|
|
27
27
|
multiprocessing.set_start_method("spawn", force=True)
|
28
28
|
|
29
29
|
# Reduce warning
|
30
|
-
os.environ["
|
30
|
+
os.environ["SGL_IN_DEEPGEMM_PRECOMPILE_STAGE"] = "1"
|
31
|
+
# Force enable deep gemm
|
32
|
+
os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"
|
33
|
+
# Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
|
34
|
+
os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"
|
31
35
|
|
32
36
|
|
33
37
|
@dataclasses.dataclass
|
@@ -84,8 +88,36 @@ def launch_server_process_and_send_one_request(
|
|
84
88
|
headers = {
|
85
89
|
"Content-Type": "application/json; charset=utf-8",
|
86
90
|
}
|
87
|
-
|
91
|
+
if server_args.node_rank == 0:
|
92
|
+
response = requests.get(f"{base_url}/v1/models", headers=headers)
|
93
|
+
else:
|
94
|
+
# This http api is created by launch_dummy_health_check_server for none-rank0 node.
|
95
|
+
response = requests.get(f"{base_url}/health", headers=headers)
|
88
96
|
if response.status_code == 200:
|
97
|
+
# Rank-0 node send a request to sync with other node and then return.
|
98
|
+
if server_args.node_rank == 0:
|
99
|
+
response = requests.post(
|
100
|
+
f"{base_url}/generate",
|
101
|
+
json={
|
102
|
+
"input_ids": [0, 1, 2, 3],
|
103
|
+
"sampling_params": {
|
104
|
+
"max_new_tokens": 8,
|
105
|
+
"temperature": 0,
|
106
|
+
},
|
107
|
+
},
|
108
|
+
timeout=600,
|
109
|
+
)
|
110
|
+
if response.status_code != 200:
|
111
|
+
error = response.json()
|
112
|
+
raise RuntimeError(f"Sync request failed: {error}")
|
113
|
+
# Other nodes should wait for the exit signal from Rank-0 node.
|
114
|
+
else:
|
115
|
+
start_time_waiting = time.time()
|
116
|
+
while proc.is_alive():
|
117
|
+
if time.time() - start_time_waiting < timeout:
|
118
|
+
time.sleep(10)
|
119
|
+
else:
|
120
|
+
raise TimeoutError("Waiting for main node timeout!")
|
89
121
|
return proc
|
90
122
|
except requests.RequestException:
|
91
123
|
pass
|
@@ -118,10 +150,19 @@ def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
|
|
118
150
|
|
119
151
|
proc = launch_server_process_and_send_one_request(server_args, compile_args)
|
120
152
|
|
121
|
-
kill_process_tree(proc.pid)
|
122
|
-
|
123
153
|
print("\nDeepGEMM Kernels compilation finished successfully.")
|
124
154
|
|
155
|
+
# Sleep for safety
|
156
|
+
time.sleep(10)
|
157
|
+
if proc.is_alive():
|
158
|
+
# This is the rank0 node.
|
159
|
+
kill_process_tree(proc.pid)
|
160
|
+
else:
|
161
|
+
try:
|
162
|
+
kill_process_tree(proc.pid)
|
163
|
+
except Exception:
|
164
|
+
pass
|
165
|
+
|
125
166
|
|
126
167
|
if __name__ == "__main__":
|
127
168
|
parser = argparse.ArgumentParser()
|
@@ -113,7 +113,7 @@ def completion_template_exists(template_name: str) -> bool:
|
|
113
113
|
|
114
114
|
def is_completion_template_defined() -> bool:
|
115
115
|
global completion_template_name
|
116
|
-
return completion_template_name
|
116
|
+
return completion_template_name is not None
|
117
117
|
|
118
118
|
|
119
119
|
def generate_completion_prompt_from_request(request: ChatCompletionRequest) -> str:
|
@@ -182,7 +182,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|
182
182
|
tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
|
183
183
|
messages,
|
184
184
|
pil_images[image_index : image_index + image_token_cnt],
|
185
|
-
bos=
|
185
|
+
bos=True,
|
186
186
|
eos=True,
|
187
187
|
cropping=len(pil_images) <= 2,
|
188
188
|
max_req_input_len=max_req_input_len,
|
@@ -73,10 +73,14 @@ class ModelConfig:
|
|
73
73
|
)
|
74
74
|
|
75
75
|
if enable_multimodal is None:
|
76
|
-
|
76
|
+
mm_disabled_models = [
|
77
|
+
"Gemma3ForConditionalGeneration",
|
78
|
+
"Llama4ForConditionalGeneration",
|
79
|
+
]
|
80
|
+
if self.hf_config.architectures[0] in mm_disabled_models:
|
77
81
|
enable_multimodal = False
|
78
82
|
logger.info(
|
79
|
-
"Multimodal is disabled for
|
83
|
+
f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
|
80
84
|
)
|
81
85
|
else:
|
82
86
|
enable_multimodal = True
|
@@ -158,7 +162,9 @@ class ModelConfig:
|
|
158
162
|
self.attention_arch = AttentionArch.MLA
|
159
163
|
self.kv_lora_rank = self.hf_config.kv_lora_rank
|
160
164
|
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
|
161
|
-
elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures
|
165
|
+
elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
|
166
|
+
self.hf_text_config, "use_mla", True
|
167
|
+
):
|
162
168
|
self.head_dim = 256
|
163
169
|
self.attention_arch = AttentionArch.MLA
|
164
170
|
self.kv_lora_rank = self.hf_text_config.kv_lora_rank
|
@@ -0,0 +1,169 @@
|
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
14
|
+
"""Constrained decoding with llguidance backend."""
|
15
|
+
|
16
|
+
import json
|
17
|
+
import logging
|
18
|
+
import os
|
19
|
+
from typing import List, Optional, Tuple
|
20
|
+
|
21
|
+
import torch
|
22
|
+
from llguidance import LLMatcher, LLTokenizer, StructTag, grammar_from
|
23
|
+
from llguidance.hf import from_tokenizer
|
24
|
+
from llguidance.torch import (
|
25
|
+
allocate_token_bitmask,
|
26
|
+
apply_token_bitmask_inplace,
|
27
|
+
fill_next_token_bitmask,
|
28
|
+
)
|
29
|
+
|
30
|
+
from sglang.srt.constrained.base_grammar_backend import (
|
31
|
+
BaseGrammarBackend,
|
32
|
+
BaseGrammarObject,
|
33
|
+
)
|
34
|
+
|
35
|
+
logger = logging.getLogger(__name__)
|
36
|
+
|
37
|
+
|
38
|
+
class GuidanceGrammar(BaseGrammarObject):
    """Per-request grammar state backed by an llguidance ``LLMatcher``.

    The object keeps the tokenizer and the serialized grammar it was built
    from so that ``copy`` can construct a fresh matcher from the same inputs.
    """

    def __init__(self, llguidance_tokenizer: LLTokenizer, serialized_grammar: str):
        """Build a matcher for ``serialized_grammar`` over ``llguidance_tokenizer``.

        The matcher log level is read from the ``LLGUIDANCE_LOG_LEVEL``
        environment variable (defaults to ``"1"``).
        """
        super().__init__()
        self.llguidance_tokenizer = llguidance_tokenizer
        self.serialized_grammar = serialized_grammar

        matcher_log_level = int(os.getenv("LLGUIDANCE_LOG_LEVEL", "1"))
        self.ll_matcher = LLMatcher(
            self.llguidance_tokenizer,
            self.serialized_grammar,
            log_level=matcher_log_level,
        )
        # Set once the matcher reports it has stopped or rejects a token.
        self.finished = False
        # Lazily allocated token bitmask, reused across calls.
        self.bitmask = None

    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
        """Return ``(forced_tokens, "")`` if the matcher yields fast-forward tokens.

        Returns ``None`` when ``compute_ff_tokens`` gives back nothing.
        ``tokenizer`` is not used here.
        """
        forced_tokens = self.ll_matcher.compute_ff_tokens()
        return (forced_tokens, "") if forced_tokens else None

    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
        """Return the fixed placeholder pair ``("", -1)``; ``helper`` is ignored."""
        return "", -1

    def jump_and_retokenize(
        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ):
        """No-op for this backend."""
        pass

    def accept_token(self, token: int):
        """Feed ``token`` to the matcher.

        If the matcher does not consume the token, its error is logged as a
        warning and the grammar is marked finished.
        """
        if self.ll_matcher.consume_token(token):
            return
        logger.warning(f"matcher error: {self.ll_matcher.get_error()}")
        self.finished = True

    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
        """Write the matcher's next-token bitmask into row ``idx`` of ``vocab_mask``.

        Also records ``finished`` when the matcher reports it has stopped; the
        mask is filled in either case.
        """
        matcher = self.ll_matcher
        if matcher.is_stopped():
            self.finished = True

        fill_next_token_bitmask(matcher, vocab_mask, idx)

    def allocate_vocab_mask(
        self, vocab_size: int, batch_size: int, device
    ) -> torch.Tensor:
        """Return a token bitmask with ``batch_size`` rows, reusing the cached one.

        A new bitmask is allocated only when none exists yet or the cached one
        has fewer rows than requested. The width comes from the llguidance
        tokenizer's ``vocab_size``; the ``vocab_size`` and ``device`` arguments
        are not used here.
        """
        cached = self.bitmask
        if cached is not None and cached.shape[0] >= batch_size:
            # Cached buffer is large enough: hand back a view of its first rows.
            return cached[:batch_size]

        # only create bitmask when batch gets larger
        self.bitmask = allocate_token_bitmask(
            batch_size, self.llguidance_tokenizer.vocab_size
        )
        return self.bitmask

    @staticmethod
    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
        """Transfer ``vocab_mask`` to ``device`` with ``non_blocking=True``."""
        return vocab_mask.to(device, non_blocking=True)

    @staticmethod
    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
        """Apply ``vocab_mask`` to ``logits`` in place."""
        apply_token_bitmask_inplace(logits, vocab_mask)

    def copy(self):
        """Return a new ``GuidanceGrammar`` built from the same tokenizer and grammar."""
        return GuidanceGrammar(self.llguidance_tokenizer, self.serialized_grammar)
|
106
|
+
|
107
|
+
|
108
|
+
class GuidanceBackend(BaseGrammarBackend):
    """Grammar backend that turns constraint specifications into ``GuidanceGrammar`` objects.

    Each ``dispatch_*`` method converts one kind of constraint (JSON schema,
    regex, EBNF, structural tag) into a serialized llguidance grammar and wraps
    it in a ``GuidanceGrammar``.
    """

    def __init__(
        self,
        tokenizer,
        whitespace_pattern: Optional[str] = None,
        n_vocab: Optional[int] = None,
    ):
        """Store the tokenizer and derive the llguidance tokenizer from it.

        Args:
            tokenizer: Tokenizer object passed to ``llguidance.hf.from_tokenizer``.
            whitespace_pattern: Forwarded as the ``whitespace_pattern`` default
                when building grammars from JSON schemas.
            n_vocab: Forwarded to ``from_tokenizer`` as the vocabulary size.
        """
        super().__init__()

        self.tokenizer = tokenizer
        self.whitespace_pattern = whitespace_pattern
        self.llguidance_tokenizer = from_tokenizer(self.tokenizer, n_vocab)

    def _from_serialized(self, serialized_grammar) -> Optional[GuidanceGrammar]:
        """Wrap a serialized grammar in a ``GuidanceGrammar``.

        Returns ``None`` (after logging a warning) if construction raises.
        """
        try:
            return GuidanceGrammar(
                llguidance_tokenizer=self.llguidance_tokenizer,
                serialized_grammar=serialized_grammar,
            )
        except Exception as e:
            logger.warning(f"Skip invalid grammar: {serialized_grammar}, {e=}")
            return None

    def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
        """Build a grammar from the JSON schema given in ``key_string``."""
        serialized_grammar = LLMatcher.grammar_from_json_schema(
            key_string,
            defaults={
                "whitespace_pattern": self.whitespace_pattern,
            },
        )
        return self._from_serialized(serialized_grammar)

    def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
        """Build a grammar from the regular expression given in ``key_string``."""
        serialized_grammar = grammar_from("regex", key_string)
        return self._from_serialized(serialized_grammar)

    def dispatch_ebnf(self, key_string: str) -> Optional[GuidanceGrammar]:
        """Build a grammar from the EBNF text given in ``key_string``.

        Returns ``None`` (after logging a warning) if a ``ValueError`` is raised
        while building it.
        """
        try:
            serialized_grammar = grammar_from("ebnf", key_string)
            return self._from_serialized(serialized_grammar)
        except ValueError as e:
            # Label the offending input as EBNF (the message previously said "regex").
            logger.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
            return None

    def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
        """Build a grammar from a JSON-encoded structural-tag specification.

        ``key_string`` is parsed as JSON and must provide ``"structures"`` (each
        with ``"begin"``, ``"schema"`` and ``"end"``) and ``"triggers"``.
        Returns ``None`` (after logging a warning) on any error.
        """
        try:
            structural_tag = json.loads(key_string)
            tags = [
                StructTag(
                    begin=structure["begin"],
                    grammar=structure["schema"],
                    end=structure["end"],
                    # NOTE: only the first trigger is used for every tag.
                    trigger=structural_tag["triggers"][0],  # TODO?
                )
                for structure in structural_tag["structures"]
            ]
            g = StructTag.to_grammar(tags)
            return self._from_serialized(g)
        except Exception as e:
            # Use the module logger (not the root logger) like the other methods.
            logger.warning(f"Skip invalid structural_tag: {key_string}, {e=}")
            return None
|
@@ -463,6 +463,30 @@ def generate_embedding_convs(
|
|
463
463
|
return convs
|
464
464
|
|
465
465
|
|
466
|
+
# Models for which the system automatically prepends the missing modality tokens to the prompt
|
467
|
+
# when there are more media inputs than modality tokens in the prompt (e.g. 3 images but 2 <image> tokens)
|
468
|
+
_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
|
469
|
+
|
470
|
+
|
471
|
+
# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
|
472
|
+
def _get_full_multimodal_text_prompt(
|
473
|
+
modality_token: str, modality_count: int, text_prompt: str
|
474
|
+
) -> str:
|
475
|
+
"""Combine multimodal prompts for a multimodal language model."""
|
476
|
+
|
477
|
+
# For any existing placeholder in the text prompt, we leave it as is
|
478
|
+
left: int = modality_count - text_prompt.count(modality_token)
|
479
|
+
if left < 0:
|
480
|
+
raise ValueError(
|
481
|
+
f"Found more '{modality_token}' placeholders in input prompt than "
|
482
|
+
"actual multimodal data items."
|
483
|
+
)
|
484
|
+
|
485
|
+
# NOTE: For now we always add missing modality_token at the front of
|
486
|
+
# the prompt. This may change to be customizable in the future.
|
487
|
+
return "\n".join([modality_token] * left + [text_prompt])
|
488
|
+
|
489
|
+
|
466
490
|
def generate_chat_conv(
|
467
491
|
request: ChatCompletionRequest, template_name: str
|
468
492
|
) -> Conversation:
|
@@ -520,6 +544,12 @@ def generate_chat_conv(
|
|
520
544
|
if conv.name != "qwen2-vl"
|
521
545
|
else conv.image_token
|
522
546
|
)
|
547
|
+
add_token_as_needed: bool = (
|
548
|
+
conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
|
549
|
+
)
|
550
|
+
if add_token_as_needed:
|
551
|
+
image_token = ""
|
552
|
+
|
523
553
|
audio_token = conv.audio_token
|
524
554
|
for content in message.content:
|
525
555
|
if content.type == "text":
|
@@ -533,7 +563,10 @@ def generate_chat_conv(
|
|
533
563
|
elif content.type == "audio_url":
|
534
564
|
real_content += audio_token
|
535
565
|
conv.append_audio(content.audio_url.url)
|
536
|
-
|
566
|
+
if add_token_as_needed:
|
567
|
+
real_content = _get_full_multimodal_text_prompt(
|
568
|
+
conv.image_token, num_image_url, real_content
|
569
|
+
)
|
537
570
|
conv.append_message(conv.roles[0], real_content)
|
538
571
|
elif msg_role == "assistant":
|
539
572
|
parsed_content = ""
|