sglang 0.4.5__tar.gz → 0.4.5.post1__tar.gz
This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as published in their respective registries.
- {sglang-0.4.5/sglang.egg-info → sglang-0.4.5.post1}/PKG-INFO +14 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/README.md +1 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/pyproject.toml +15 -3
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_one_batch.py +21 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_serving.py +10 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/model_config.py +37 -5
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/base_grammar_backend.py +26 -5
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/llguidance_backend.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/outlines_backend.py +1 -0
- sglang-0.4.5.post1/sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/xgrammar_backend.py +1 -0
- sglang-0.4.5.post1/sglang/srt/disaggregation/base/__init__.py +8 -0
- sglang-0.4.5.post1/sglang/srt/disaggregation/base/conn.py +113 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/decode.py +18 -5
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/mini_lb.py +53 -122
- sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/__init__.py +6 -0
- sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/conn.py +615 -0
- sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/prefill.py +43 -19
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/utils.py +31 -0
- sglang-0.4.5.post1/sglang/srt/entrypoints/EngineBase.py +53 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/entrypoints/engine.py +36 -8
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/entrypoints/http_server.py +37 -8
- sglang-0.4.5.post1/sglang/srt/entrypoints/http_server_engine.py +142 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/entrypoints/verl_engine.py +37 -10
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/hf_transformers_utils.py +4 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashattention_backend.py +330 -200
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashinfer_backend.py +13 -7
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/vision.py +1 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/dp_attention.py +2 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/elementwise.py +15 -2
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/linear.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
- sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +38 -21
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/router.py +7 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/topk.py +37 -16
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/__init__.py +12 -5
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8.py +25 -13
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8_kernel.py +130 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8_utils.py +34 -6
- sglang-0.4.5.post1/sglang/srt/layers/quantization/kv_cache.py +89 -0
- sglang-0.4.5.post1/sglang/srt/layers/quantization/modelopt_quant.py +463 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/w8a8_int8.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/radix_attention.py +13 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/rotary_embedding.py +12 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/io_struct.py +254 -97
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/mm_utils.py +3 -2
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/schedule_batch.py +62 -21
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/scheduler.py +71 -14
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/tokenizer_manager.py +17 -3
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/tp_worker.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/memory_pool.py +14 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/metrics/collector.py +9 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_executor/cuda_graph_runner.py +7 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_executor/forward_batch_info.py +234 -15
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_executor/model_runner.py +48 -9
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/loader.py +31 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/weight_utils.py +4 -2
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/baichuan.py +2 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/chatglm.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/commandr.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/dbrx.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_v2.py +248 -61
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/exaone.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma2.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma3_causal.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gpt2.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gpt_bigcode.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/granite.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/grok.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/internlm2.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama4.py +101 -34
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpm.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpm3.py +2 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mixtral.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mixtral_quant.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mllama.py +51 -8
- sglang-0.4.5.post1/sglang/srt/models/mllama4.py +227 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/olmo.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/olmo2.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/olmoe.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/phi3_small.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_5_vl.py +35 -70
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_moe.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_vl.py +27 -25
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/stablelm.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/xverse.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/xverse_moe.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/openai_api/adapter.py +4 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/patch_torch.py +11 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/server_args.py +34 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_utils.py +1 -11
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_worker.py +6 -2
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/utils.py +120 -9
- sglang-0.4.5.post1/sglang/test/attention/test_flashattn_backend.py +350 -0
- sglang-0.4.5.post1/sglang/test/attention/test_flashattn_mla_backend.py +285 -0
- sglang-0.4.5.post1/sglang/test/attention/test_prefix_chunk_info.py +224 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_block_fp8.py +57 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_utils.py +19 -8
- sglang-0.4.5.post1/sglang/version.py +1 -0
- {sglang-0.4.5 → sglang-0.4.5.post1/sglang.egg-info}/PKG-INFO +14 -4
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/SOURCES.txt +17 -3
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/requires.txt +13 -2
- sglang-0.4.5/sglang/srt/disaggregation/conn.py +0 -81
- sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -146
- sglang-0.4.5/sglang/srt/layers/quantization/kv_cache.py +0 -98
- sglang-0.4.5/sglang/srt/layers/quantization/modelopt_quant.py +0 -196
- sglang-0.4.5/sglang/srt/models/mllama4.py +0 -154
- sglang-0.4.5/sglang/test/attention/test_flashattn_backend.py +0 -312
- sglang-0.4.5/sglang/version.py +0 -1
- {sglang-0.4.5 → sglang-0.4.5.post1}/LICENSE +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/setup.cfg +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/api.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/check_env.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/global_config.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/launch_server.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/code_completion_parser.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/deepseekvl2.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/janus_pro.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/base_connector.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/redis.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/serde/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/custom_op.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/function_call_parser.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/awq.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/cache_controller.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/expert_distribution.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/paged_allocator.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/clip.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_nextn.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_vl2.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma3_mm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_eagle3.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpmo.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/platforms/interface.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/reasoning_parser.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/server.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/warmup.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/attention/__init__.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/runners.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/send_one.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_block_fp8_ep.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_custom_ops.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/utils.py +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.4.5
|
3
|
+
Version: 0.4.5.post1
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -239,20 +239,30 @@ Requires-Dist: python-multipart; extra == "runtime-common"
|
|
239
239
|
Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
|
240
240
|
Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
|
241
241
|
Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
|
242
|
-
Requires-Dist: transformers==4.51.
|
242
|
+
Requires-Dist: transformers==4.51.1; extra == "runtime-common"
|
243
243
|
Requires-Dist: uvicorn; extra == "runtime-common"
|
244
244
|
Requires-Dist: uvloop; extra == "runtime-common"
|
245
245
|
Requires-Dist: compressed-tensors; extra == "runtime-common"
|
246
246
|
Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
|
247
247
|
Provides-Extra: srt
|
248
248
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
249
|
-
Requires-Dist: sgl-kernel==0.0.
|
249
|
+
Requires-Dist: sgl-kernel==0.0.9.post1; extra == "srt"
|
250
250
|
Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
|
251
251
|
Requires-Dist: torch==2.5.1; extra == "srt"
|
252
|
+
Requires-Dist: torchvision==0.20.1; extra == "srt"
|
252
253
|
Requires-Dist: cuda-python; extra == "srt"
|
253
254
|
Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
|
254
255
|
Requires-Dist: partial_json_parser; extra == "srt"
|
255
256
|
Requires-Dist: einops; extra == "srt"
|
257
|
+
Provides-Extra: blackwell
|
258
|
+
Requires-Dist: sglang[runtime_common]; extra == "blackwell"
|
259
|
+
Requires-Dist: sgl-kernel; extra == "blackwell"
|
260
|
+
Requires-Dist: torch; extra == "blackwell"
|
261
|
+
Requires-Dist: torchvision; extra == "blackwell"
|
262
|
+
Requires-Dist: cuda-python; extra == "blackwell"
|
263
|
+
Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
|
264
|
+
Requires-Dist: partial_json_parser; extra == "blackwell"
|
265
|
+
Requires-Dist: einops; extra == "blackwell"
|
256
266
|
Provides-Extra: srt-hip
|
257
267
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
|
258
268
|
Requires-Dist: torch; extra == "srt-hip"
|
@@ -391,7 +401,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
|
|
391
401
|
|
392
402
|
## Adoption and Sponsorship
|
393
403
|
The project has been deployed to large-scale production, generating trillions of tokens every day.
|
394
|
-
It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
|
404
|
+
It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
|
395
405
|
|
396
406
|
<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
|
397
407
|
|
@@ -63,7 +63,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
|
|
63
63
|
|
64
64
|
## Adoption and Sponsorship
|
65
65
|
The project has been deployed to large-scale production, generating trillions of tokens every day.
|
66
|
-
It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
|
66
|
+
It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
|
67
67
|
|
68
68
|
<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
|
69
69
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "sglang"
|
7
|
-
version = "0.4.5"
|
7
|
+
version = "0.4.5.post1"
|
8
8
|
description = "SGLang is yet another fast serving framework for large language models and vision language models."
|
9
9
|
readme = "README.md"
|
10
10
|
requires-python = ">=3.8"
|
@@ -38,7 +38,7 @@ runtime_common = [
|
|
38
38
|
"pyzmq>=25.1.2",
|
39
39
|
"soundfile==0.13.1",
|
40
40
|
"torchao>=0.7.0",
|
41
|
-
"transformers==4.51.
|
41
|
+
"transformers==4.51.1",
|
42
42
|
"uvicorn",
|
43
43
|
"uvloop",
|
44
44
|
"compressed-tensors",
|
@@ -47,9 +47,21 @@ runtime_common = [
|
|
47
47
|
|
48
48
|
srt = [
|
49
49
|
"sglang[runtime_common]",
|
50
|
-
"sgl-kernel==0.0.
|
50
|
+
"sgl-kernel==0.0.9.post1",
|
51
51
|
"flashinfer_python==0.2.3",
|
52
52
|
"torch==2.5.1",
|
53
|
+
"torchvision==0.20.1",
|
54
|
+
"cuda-python",
|
55
|
+
"outlines>=0.0.44,<=0.1.11",
|
56
|
+
"partial_json_parser",
|
57
|
+
"einops",
|
58
|
+
]
|
59
|
+
|
60
|
+
blackwell = [
|
61
|
+
"sglang[runtime_common]",
|
62
|
+
"sgl-kernel",
|
63
|
+
"torch",
|
64
|
+
"torchvision",
|
53
65
|
"cuda-python",
|
54
66
|
"outlines>=0.0.44,<=0.1.11",
|
55
67
|
"partial_json_parser",
|
@@ -60,6 +60,7 @@ from sglang.srt.configs.model_config import ModelConfig
|
|
60
60
|
from sglang.srt.entrypoints.engine import _set_envs_and_config
|
61
61
|
from sglang.srt.hf_transformers_utils import get_tokenizer
|
62
62
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
63
|
+
from sglang.srt.managers.scheduler import Scheduler
|
63
64
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
64
65
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
65
66
|
from sglang.srt.sampling.sampling_params import SamplingParams
|
@@ -135,6 +136,7 @@ def load_model(server_args, port_args, tp_rank):
|
|
135
136
|
context_length=server_args.context_length,
|
136
137
|
model_override_args=server_args.json_model_override_args,
|
137
138
|
is_embedding=server_args.is_embedding,
|
139
|
+
enable_multimodal=server_args.enable_multimodal,
|
138
140
|
dtype=server_args.dtype,
|
139
141
|
quantization=server_args.quantization,
|
140
142
|
)
|
@@ -184,6 +186,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
|
|
184
186
|
req.prefix_indices = []
|
185
187
|
req.fill_ids = req.origin_input_ids
|
186
188
|
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
|
189
|
+
req.logprob_start_len = len(req.origin_input_ids) - 1
|
187
190
|
reqs.append(req)
|
188
191
|
|
189
192
|
return input_ids, reqs
|
@@ -199,6 +202,7 @@ def prepare_extend_inputs_for_correctness_test(
|
|
199
202
|
i, : bench_args.cut_len
|
200
203
|
]
|
201
204
|
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
|
205
|
+
req.logprob_start_len = len(req.origin_input_ids) - 1
|
202
206
|
return reqs
|
203
207
|
|
204
208
|
|
@@ -220,6 +224,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
|
|
220
224
|
req.prefix_indices = []
|
221
225
|
req.fill_ids = req.origin_input_ids
|
222
226
|
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
|
227
|
+
req.logprob_start_len = len(req.origin_input_ids) - 1
|
223
228
|
reqs.append(req)
|
224
229
|
|
225
230
|
return reqs
|
@@ -238,6 +243,7 @@ def extend(reqs, model_runner):
|
|
238
243
|
enable_custom_logit_processor=False,
|
239
244
|
)
|
240
245
|
batch.prepare_for_extend()
|
246
|
+
_maybe_prepare_dp_attn_batch(batch, model_runner)
|
241
247
|
model_worker_batch = batch.get_model_worker_batch()
|
242
248
|
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
|
243
249
|
logits_output = model_runner.forward(forward_batch)
|
@@ -249,6 +255,7 @@ def extend(reqs, model_runner):
|
|
249
255
|
def decode(input_token_ids, batch, model_runner):
|
250
256
|
batch.output_ids = input_token_ids
|
251
257
|
batch.prepare_for_decode()
|
258
|
+
_maybe_prepare_dp_attn_batch(batch, model_runner)
|
252
259
|
model_worker_batch = batch.get_model_worker_batch()
|
253
260
|
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
|
254
261
|
logits_output = model_runner.forward(forward_batch)
|
@@ -256,6 +263,20 @@ def decode(input_token_ids, batch, model_runner):
|
|
256
263
|
return next_token_ids, logits_output.next_token_logits
|
257
264
|
|
258
265
|
|
266
|
+
def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
|
267
|
+
if model_runner.server_args.enable_dp_attention:
|
268
|
+
Scheduler.prepare_dp_attn_batch_raw(
|
269
|
+
batch,
|
270
|
+
dp_size=model_runner.server_args.dp_size,
|
271
|
+
attn_tp_size=1,
|
272
|
+
tp_cpu_group=model_runner.tp_group.cpu_group,
|
273
|
+
get_idle_batch=None,
|
274
|
+
disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
|
275
|
+
spec_algorithm=SpeculativeAlgorithm.NONE,
|
276
|
+
speculative_num_draft_tokens=None,
|
277
|
+
)
|
278
|
+
|
279
|
+
|
259
280
|
def correctness_test(
|
260
281
|
server_args,
|
261
282
|
port_args,
|
@@ -490,7 +490,7 @@ def get_dataset(args, tokenizer):
|
|
490
490
|
prompt_suffix=args.prompt_suffix,
|
491
491
|
apply_chat_template=args.apply_chat_template,
|
492
492
|
)
|
493
|
-
elif args.dataset_name
|
493
|
+
elif args.dataset_name.startswith("random"):
|
494
494
|
input_requests = sample_random_requests(
|
495
495
|
input_len=args.random_input_len,
|
496
496
|
output_len=args.random_output_len,
|
@@ -498,6 +498,7 @@ def get_dataset(args, tokenizer):
|
|
498
498
|
range_ratio=args.random_range_ratio,
|
499
499
|
tokenizer=tokenizer,
|
500
500
|
dataset_path=args.dataset_path,
|
501
|
+
random_sample=args.dataset_name == "random",
|
501
502
|
)
|
502
503
|
elif args.dataset_name == "generated-shared-prefix":
|
503
504
|
input_requests = sample_generated_shared_prefix_requests(
|
@@ -687,6 +688,7 @@ def sample_random_requests(
|
|
687
688
|
range_ratio: float,
|
688
689
|
tokenizer: PreTrainedTokenizerBase,
|
689
690
|
dataset_path: str,
|
691
|
+
random_sample: bool = True,
|
690
692
|
) -> List[Tuple[str, int, int]]:
|
691
693
|
|
692
694
|
input_lens = np.random.randint(
|
@@ -700,11 +702,15 @@ def sample_random_requests(
|
|
700
702
|
size=num_prompts,
|
701
703
|
)
|
702
704
|
|
703
|
-
if
|
705
|
+
if random_sample:
|
704
706
|
# Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
|
705
707
|
|
706
708
|
# Download sharegpt if necessary
|
707
709
|
if not os.path.isfile(dataset_path):
|
710
|
+
print(
|
711
|
+
"If you do not want to randomly sample from a dataset,"
|
712
|
+
" please use --dataset-name random-ids."
|
713
|
+
)
|
708
714
|
dataset_path = download_and_cache_file(SHAREGPT_URL)
|
709
715
|
|
710
716
|
# Load the dataset.
|
@@ -1223,7 +1229,7 @@ async def benchmark(
|
|
1223
1229
|
output_file_name = args.output_file
|
1224
1230
|
else:
|
1225
1231
|
now = datetime.now().strftime("%m%d")
|
1226
|
-
if args.dataset_name
|
1232
|
+
if args.dataset_name.startswith("random"):
|
1227
1233
|
output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
|
1228
1234
|
else:
|
1229
1235
|
output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
|
@@ -1442,7 +1448,7 @@ if __name__ == "__main__":
|
|
1442
1448
|
"--dataset-name",
|
1443
1449
|
type=str,
|
1444
1450
|
default="sharegpt",
|
1445
|
-
choices=["sharegpt", "random", "generated-shared-prefix"],
|
1451
|
+
choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
|
1446
1452
|
help="Name of the dataset to benchmark on.",
|
1447
1453
|
)
|
1448
1454
|
parser.add_argument(
|
@@ -15,6 +15,7 @@
|
|
15
15
|
import json
|
16
16
|
import logging
|
17
17
|
import math
|
18
|
+
import os
|
18
19
|
from enum import IntEnum, auto
|
19
20
|
from typing import List, Optional, Set, Union
|
20
21
|
|
@@ -42,10 +43,12 @@ class ModelConfig:
|
|
42
43
|
context_length: Optional[int] = None,
|
43
44
|
model_override_args: Optional[str] = None,
|
44
45
|
is_embedding: Optional[bool] = None,
|
46
|
+
enable_multimodal: Optional[bool] = None,
|
45
47
|
dtype: str = "auto",
|
46
48
|
quantization: Optional[str] = None,
|
47
49
|
override_config_file: Optional[str] = None,
|
48
50
|
) -> None:
|
51
|
+
|
49
52
|
self.model_path = model_path
|
50
53
|
self.revision = revision
|
51
54
|
self.quantization = quantization
|
@@ -69,14 +72,28 @@ class ModelConfig:
|
|
69
72
|
self.hf_text_config, "attention_chunk_size", None
|
70
73
|
)
|
71
74
|
|
75
|
+
if enable_multimodal is None:
|
76
|
+
if self.hf_config.architectures == "Llama4ForConditionalGeneration":
|
77
|
+
enable_multimodal = False
|
78
|
+
else:
|
79
|
+
enable_multimodal = True
|
80
|
+
|
72
81
|
# Check model type
|
73
82
|
self.is_generation = is_generation_model(
|
74
83
|
self.hf_config.architectures, is_embedding
|
75
84
|
)
|
76
|
-
self.is_multimodal = is_multimodal_model(
|
77
|
-
|
78
|
-
|
79
|
-
self.
|
85
|
+
self.is_multimodal = enable_multimodal and is_multimodal_model(
|
86
|
+
self.hf_config.architectures
|
87
|
+
)
|
88
|
+
self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
|
89
|
+
self.hf_config.architectures
|
90
|
+
)
|
91
|
+
self.is_image_gen = enable_multimodal and is_image_gen_model(
|
92
|
+
self.hf_config.architectures
|
93
|
+
)
|
94
|
+
self.is_audio_model = enable_multimodal and is_audio_model(
|
95
|
+
self.hf_config.architectures
|
96
|
+
)
|
80
97
|
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
81
98
|
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
82
99
|
|
@@ -234,6 +251,20 @@ class ModelConfig:
|
|
234
251
|
if quant_cfg is None:
|
235
252
|
# compressed-tensors uses a "compression_config" key
|
236
253
|
quant_cfg = getattr(self.hf_config, "compression_config", None)
|
254
|
+
if quant_cfg is None:
|
255
|
+
# check if is modelopt model -- modelopt doesn't have corresponding field
|
256
|
+
# in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
|
257
|
+
# example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
|
258
|
+
is_local = os.path.exists(self.model_path)
|
259
|
+
modelopt_quant_config = {"quant_method": "modelopt"}
|
260
|
+
if not is_local:
|
261
|
+
from huggingface_hub import HfApi
|
262
|
+
|
263
|
+
hf_api = HfApi()
|
264
|
+
if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
|
265
|
+
quant_cfg = modelopt_quant_config
|
266
|
+
elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
|
267
|
+
quant_cfg = modelopt_quant_config
|
237
268
|
return quant_cfg
|
238
269
|
|
239
270
|
# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
|
@@ -264,6 +295,7 @@ class ModelConfig:
|
|
264
295
|
"moe_wna16",
|
265
296
|
]
|
266
297
|
compatible_quantization_methods = {
|
298
|
+
"modelopt_fp4": ["modelopt"],
|
267
299
|
"w8a8_int8": ["compressed-tensors", "compressed_tensors"],
|
268
300
|
"w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
|
269
301
|
}
|
@@ -470,8 +502,8 @@ multimodal_model_archs = [
|
|
470
502
|
"Gemma3ForConditionalGeneration",
|
471
503
|
"Grok1VForCausalLM",
|
472
504
|
"Grok1AForCausalLM",
|
473
|
-
# TODO: add multimodal support for "Llama4ForConditionalGeneration",
|
474
505
|
"LlavaLlamaForCausalLM",
|
506
|
+
"Llama4ForConditionalGeneration",
|
475
507
|
"LlavaMistralForCausalLM",
|
476
508
|
"LlavaQwenForCausalLM",
|
477
509
|
"LlavaVidForCausalLM",
|
@@ -28,6 +28,18 @@ logger = logging.getLogger(__name__)
|
|
28
28
|
|
29
29
|
|
30
30
|
class BaseGrammarObject(ABC):
|
31
|
+
|
32
|
+
def __init__(self):
|
33
|
+
self._finished = False
|
34
|
+
|
35
|
+
@property
|
36
|
+
def finished(self):
|
37
|
+
return self._finished
|
38
|
+
|
39
|
+
@finished.setter
|
40
|
+
def finished(self, finished):
|
41
|
+
self._finished = finished
|
42
|
+
|
31
43
|
@abstractmethod
|
32
44
|
def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
|
33
45
|
"""
|
@@ -59,6 +71,13 @@ class BaseGrammarObject(ABC):
|
|
59
71
|
"""
|
60
72
|
raise NotImplementedError
|
61
73
|
|
74
|
+
@abstractmethod
|
75
|
+
def accept_token(self, token: int) -> None:
|
76
|
+
"""
|
77
|
+
Accept a token in the grammar.
|
78
|
+
"""
|
79
|
+
raise NotImplementedError
|
80
|
+
|
62
81
|
@abstractmethod
|
63
82
|
def allocate_vocab_mask(
|
64
83
|
self, vocab_size: int, batch_size: int, device
|
@@ -90,7 +109,7 @@ class CacheEntry:
|
|
90
109
|
event: Event
|
91
110
|
|
92
111
|
|
93
|
-
class BaseGrammarBackend
|
112
|
+
class BaseGrammarBackend:
|
94
113
|
def __init__(self):
|
95
114
|
self.executor = ThreadPoolExecutor()
|
96
115
|
self.cache: Dict[Tuple[str, str], CacheEntry] = {}
|
@@ -107,19 +126,15 @@ class BaseGrammarBackend(ABC):
|
|
107
126
|
"""
|
108
127
|
raise ValueError(f"Invalid key_type: {key_type}={key_string}")
|
109
128
|
|
110
|
-
@abstractmethod
|
111
129
|
def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
|
112
130
|
return self._not_supported("json", key_string)
|
113
131
|
|
114
|
-
@abstractmethod
|
115
132
|
def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
|
116
133
|
return self._not_supported("regex", key_string)
|
117
134
|
|
118
|
-
@abstractmethod
|
119
135
|
def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
|
120
136
|
return self._not_supported("ebnf", key_string)
|
121
137
|
|
122
|
-
@abstractmethod
|
123
138
|
def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
|
124
139
|
return self._not_supported("structural_tag", key_string)
|
125
140
|
|
@@ -195,4 +210,10 @@ def create_grammar_backend(
|
|
195
210
|
else:
|
196
211
|
raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
|
197
212
|
|
213
|
+
if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
|
214
|
+
from .reasoner_grammar_backend import ReasonerGrammarBackend
|
215
|
+
|
216
|
+
grammar_backend = ReasonerGrammarBackend(
|
217
|
+
grammar_backend, tokenizer.think_end_id
|
218
|
+
)
|
198
219
|
return grammar_backend
|
@@ -33,6 +33,7 @@ class GuidanceGrammar(BaseGrammarObject):
|
|
33
33
|
def __init__(
|
34
34
|
self, llguidance_tokenizer: llguidance.LLTokenizer, serialized_grammar: str
|
35
35
|
):
|
36
|
+
super().__init__()
|
36
37
|
self.llguidance_tokenizer = llguidance_tokenizer
|
37
38
|
self.serialized_grammar = serialized_grammar
|
38
39
|
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
14
|
+
"""The baseclass of a backend for reasoner grammar-guided constrained decoding."""
|
15
|
+
|
16
|
+
from concurrent.futures import Future
|
17
|
+
from typing import List, Optional, Tuple
|
18
|
+
|
19
|
+
import torch
|
20
|
+
|
21
|
+
from .base_grammar_backend import BaseGrammarBackend, BaseGrammarObject
|
22
|
+
|
23
|
+
|
24
|
+
class ReasonerGrammarObject(BaseGrammarObject):
|
25
|
+
def __init__(self, grammar: BaseGrammarObject, think_end_id):
|
26
|
+
super().__init__()
|
27
|
+
self.grammar = grammar
|
28
|
+
self.think_end_id = think_end_id
|
29
|
+
self.is_in_reasoning = True
|
30
|
+
|
31
|
+
@property
|
32
|
+
def finished(self):
|
33
|
+
return self.grammar.finished
|
34
|
+
|
35
|
+
@finished.setter
|
36
|
+
def finished(self, finished):
|
37
|
+
self.grammar.finished = finished
|
38
|
+
|
39
|
+
def allocate_vocab_mask(
|
40
|
+
self, vocab_size: int, batch_size: int, device
|
41
|
+
) -> torch.Tensor:
|
42
|
+
return self.grammar.allocate_vocab_mask(vocab_size, batch_size, device)
|
43
|
+
|
44
|
+
def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
|
45
|
+
if not self.is_in_reasoning:
|
46
|
+
self.grammar.fill_vocab_mask(vocab_mask, idx)
|
47
|
+
|
48
|
+
def move_vocab_mask(self, vocab_mask: torch.Tensor, device) -> torch.Tensor:
|
49
|
+
return self.grammar.move_vocab_mask(vocab_mask, device)
|
50
|
+
|
51
|
+
@property
|
52
|
+
def apply_vocab_mask(self):
|
53
|
+
return self.grammar.apply_vocab_mask
|
54
|
+
|
55
|
+
def accept_token(self, token: int):
|
56
|
+
if token == self.think_end_id:
|
57
|
+
self.is_in_reasoning = False
|
58
|
+
|
59
|
+
if not self.is_in_reasoning and token != self.think_end_id:
|
60
|
+
self.grammar.accept_token(token)
|
61
|
+
|
62
|
+
def try_jump_forward(self, tokenizer):
|
63
|
+
return self.grammar.try_jump_forward(tokenizer)
|
64
|
+
|
65
|
+
def jump_forward_str_state(self, helper):
|
66
|
+
return self.grammar.jump_forward_str_state(helper)
|
67
|
+
|
68
|
+
def jump_and_retokenize(
|
69
|
+
self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
|
70
|
+
):
|
71
|
+
return self.grammar.jump_and_retokenize(
|
72
|
+
old_output_ids, new_output_ids, next_state
|
73
|
+
)
|
74
|
+
|
75
|
+
def copy(self) -> BaseGrammarObject:
|
76
|
+
return ReasonerGrammarObject(self.grammar.copy(), self.think_end_id)
|
77
|
+
|
78
|
+
|
79
|
+
class ReasonerGrammarBackend(BaseGrammarBackend):
|
80
|
+
def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
|
81
|
+
self.grammar_backend = grammar_backend
|
82
|
+
self.think_end_id = think_end_id
|
83
|
+
|
84
|
+
def get_cached_value(self, key: Tuple[str, str]) -> Optional[ReasonerGrammarObject]:
|
85
|
+
grammar = self.grammar_backend.get_cached_value(key)
|
86
|
+
return ReasonerGrammarObject(grammar, self.think_end_id) if grammar else None
|
87
|
+
|
88
|
+
def get_future_value(self, key: Tuple[str, str]) -> Future:
|
89
|
+
grammar = Future()
|
90
|
+
|
91
|
+
def callback(f: Future):
|
92
|
+
if result := f.result():
|
93
|
+
grammar.set_result(ReasonerGrammarObject(result, self.think_end_id))
|
94
|
+
else:
|
95
|
+
grammar.set_result(None)
|
96
|
+
|
97
|
+
self.grammar_backend.get_future_value(key).add_done_callback(callback)
|
98
|
+
return grammar
|
99
|
+
|
100
|
+
def reset(self):
|
101
|
+
self.grammar_backend.reset()
|
@@ -0,0 +1,113 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
import numpy.typing as npt
|
6
|
+
|
7
|
+
from sglang.srt.disaggregation.utils import DisaggregationMode
|
8
|
+
from sglang.srt.server_args import ServerArgs
|
9
|
+
|
10
|
+
|
11
|
+
class KVArgs:
|
12
|
+
engine_rank: int
|
13
|
+
kv_data_ptrs: list[int]
|
14
|
+
kv_data_lens: list[int]
|
15
|
+
kv_item_lens: list[int]
|
16
|
+
aux_data_ptrs: list[int]
|
17
|
+
aux_data_lens: list[int]
|
18
|
+
aux_item_lens: list[int]
|
19
|
+
ib_device: str
|
20
|
+
gpu_id: int
|
21
|
+
|
22
|
+
|
23
|
+
class KVPoll:
|
24
|
+
Failed = 0
|
25
|
+
Bootstrapping = 1
|
26
|
+
WaitingForInput = 2
|
27
|
+
Transferring = 3
|
28
|
+
Success = 4
|
29
|
+
|
30
|
+
|
31
|
+
class BaseKVManager(ABC):
|
32
|
+
"""Base class for managing transfers states"""
|
33
|
+
|
34
|
+
@abstractmethod
|
35
|
+
def __init__(
|
36
|
+
self,
|
37
|
+
args: KVArgs,
|
38
|
+
disaggregation_mode: DisaggregationMode,
|
39
|
+
server_args: ServerArgs,
|
40
|
+
): ...
|
41
|
+
|
42
|
+
|
43
|
+
class BaseKVSender(ABC):
|
44
|
+
|
45
|
+
@abstractmethod
|
46
|
+
def __init__(
|
47
|
+
self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int
|
48
|
+
): ...
|
49
|
+
|
50
|
+
@abstractmethod
|
51
|
+
def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
|
52
|
+
"""
|
53
|
+
Notify the decoder server about the kv indices length and aux index
|
54
|
+
"""
|
55
|
+
...
|
56
|
+
|
57
|
+
@abstractmethod
|
58
|
+
def send(self, kv_indices: npt.NDArray[np.int64]):
|
59
|
+
"""
|
60
|
+
Send the kv cache at the given kv indices to the decoder server
|
61
|
+
"""
|
62
|
+
...
|
63
|
+
|
64
|
+
@abstractmethod
|
65
|
+
def poll(self) -> KVPoll:
|
66
|
+
"""
|
67
|
+
Check the status of the kv cache transfer
|
68
|
+
"""
|
69
|
+
...
|
70
|
+
|
71
|
+
@abstractmethod
|
72
|
+
def failure_exception(self):
|
73
|
+
"""
|
74
|
+
Raise an exception if the kv cache transfer fails
|
75
|
+
"""
|
76
|
+
...
|
77
|
+
|
78
|
+
|
79
|
+
class BaseKVReceiver(ABC):
|
80
|
+
|
81
|
+
@abstractmethod
|
82
|
+
def __init__(
|
83
|
+
self,
|
84
|
+
mgr: BaseKVManager,
|
85
|
+
bootstrap_addr: str,
|
86
|
+
bootstrap_room: Optional[int] = None,
|
87
|
+
): ...
|
88
|
+
|
89
|
+
@abstractmethod
|
90
|
+
def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
|
91
|
+
"""
|
92
|
+
Notify the prefill server about the kv indices and aux index
|
93
|
+
"""
|
94
|
+
...
|
95
|
+
|
96
|
+
@abstractmethod
|
97
|
+
def poll(self) -> KVPoll:
|
98
|
+
"""
|
99
|
+
Check the status of the kv cache transfer
|
100
|
+
"""
|
101
|
+
...
|
102
|
+
|
103
|
+
@abstractmethod
|
104
|
+
def failure_exception(self):
|
105
|
+
"""
|
106
|
+
Raise an exception if the kv cache transfer fails
|
107
|
+
"""
|
108
|
+
...
|
109
|
+
|
110
|
+
|
111
|
+
class BaseKVBootstrapServer(ABC):
|
112
|
+
@abstractmethod
|
113
|
+
def __init__(self, port: int): ...
|