sglang 0.5.1.post2__tar.gz → 0.5.2rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.5.1.post2/sglang.egg-info → sglang-0.5.2rc0}/PKG-INFO +7 -6
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/README.md +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/pyproject.toml +6 -5
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/bench_one_batch.py +3 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/bench_one_batch_server.py +79 -53
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/bench_serving.py +186 -14
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/profiler.py +0 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/__init__.py +2 -0
- sglang-0.5.2rc0/sglang/srt/configs/longcat_flash.py +104 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/model_config.py +12 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/__init__.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/base_connector.py +1 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/redis.py +2 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/serde/__init__.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/serde/safe_serde.py +4 -3
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/conversation.py +38 -5
- sglang-0.5.2rc0/sglang/srt/disaggregation/ascend/conn.py +117 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/launch_lb.py +0 -13
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/mini_lb.py +33 -8
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/prefill.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/parallel_state.py +24 -14
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/engine.py +19 -12
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/http_server.py +174 -34
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/protocol.py +87 -24
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/eplb_manager.py +26 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/expert_distribution.py +29 -2
- sglang-0.5.2rc0/sglang/srt/function_call/deepseekv31_detector.py +222 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/function_call_parser.py +2 -0
- sglang-0.5.2rc0/sglang/srt/function_call/gpt_oss_detector.py +219 -0
- sglang-0.5.2rc0/sglang/srt/harmony_parser.py +588 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/hf_transformers_utils.py +26 -7
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/activation.py +12 -0
- sglang-0.5.2rc0/sglang/srt/layers/attention/ascend_backend.py +570 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/flashattention_backend.py +241 -7
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/communicator.py +1 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/layernorm.py +28 -3
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/linear.py +3 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/logits_processor.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/cutlass_moe.py +0 -8
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang-0.5.2rc0/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang-0.5.2rc0/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/topk.py +35 -12
- sglang-0.5.2rc0/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +233 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/fp8.py +2 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/fp8_utils.py +2 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/mxfp4.py +25 -27
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/utils.py +13 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/rotary_embedding.py +28 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/sampler.py +29 -5
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/utils.py +0 -14
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/cache_controller.py +237 -204
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/detokenizer_manager.py +48 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/io_struct.py +57 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/mm_utils.py +5 -1
- sglang-0.5.2rc0/sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/scheduler.py +94 -9
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/tokenizer_manager.py +122 -42
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/chunk_cache.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/hicache_storage.py +51 -23
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/hiradix_cache.py +87 -71
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/memory_pool.py +77 -14
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/memory_pool_host.py +4 -5
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/radix_cache.py +6 -4
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_executor/model_runner.py +6 -5
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_loader/loader.py +15 -24
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_loader/utils.py +12 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/deepseek_v2.py +38 -13
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gpt_oss.py +2 -15
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llama_eagle3.py +4 -0
- sglang-0.5.2rc0/sglang/srt/models/longcat_flash.py +1015 -0
- sglang-0.5.2rc0/sglang/srt/models/longcat_flash_nextn.py +691 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2.py +26 -3
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2_5_vl.py +66 -41
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2_moe.py +22 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/transformers.py +1 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang-0.5.2rc0/sglang/srt/reasoning_parser.py +309 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/server_args.py +122 -56
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/speculative/eagle_worker.py +28 -8
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/utils.py +73 -5
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang-0.5.2rc0/sglang/version.py +1 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0/sglang.egg-info}/PKG-INFO +7 -6
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang.egg-info/SOURCES.txt +8 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang.egg-info/requires.txt +5 -4
- sglang-0.5.1.post2/sglang/srt/disaggregation/ascend/conn.py +0 -42
- sglang-0.5.1.post2/sglang/srt/function_call/gpt_oss_detector.py +0 -331
- sglang-0.5.1.post2/sglang/srt/layers/attention/ascend_backend.py +0 -332
- sglang-0.5.1.post2/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +0 -335
- sglang-0.5.1.post2/sglang/srt/reasoning_parser.py +0 -553
- sglang-0.5.1.post2/sglang/version.py +0 -1
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/LICENSE +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/setup.cfg +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/check_env.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/compile_deep_gemm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/eval/llama3_eval.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/eval/loogle_eval.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/global_config.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/api.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/chat_template.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/choices.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/compiler.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/interpreter.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/ir.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/lang/tracer.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/launch_server.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/bench_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/code_completion_parser.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/deepseekvl2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/internvl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/janus_pro.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/kimi_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/step3_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/update_config.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constants.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constrained/llguidance_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/custom_op.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/debug_utils/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/debug_utils/dump_comparator.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/debug_utils/dumper.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/debug_utils/text_comparator.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/ascend/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/ascend/transfer_engine.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/base/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/base/conn.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/common/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/common/conn.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/common/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/decode.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/fake/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/fake/conn.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/kv_events.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/mooncake/conn.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/nixl/conn.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/disaggregation/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/pynccl_allocator.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/quick_all_reduce.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/naive_distributed.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/EngineBase.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/context.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/harmony_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/http_server_engine.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/serving_responses.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/tool_server.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/openai/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/entrypoints/tool.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/expert_location.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/expert_location_dispatch.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/eplb/expert_location_updater.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/base_format_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/core_types.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/ebnf_composer.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/glm4_moe_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/kimik2_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/llama32_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/mistral_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/pythonic_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/qwen25_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/qwen3_coder_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/step3_detector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/function_call/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/host_shared_memory.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/jinja_template_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/amx_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/aiter_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/merge_state.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/tbo_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/trtllm_mha_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/vision.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/vision_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/wave_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/wave_ops/decode_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/wave_ops/extend_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/attention/wave_ops/prefill_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/elementwise.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/flashinfer_comm_fusion.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/cutlass_w4a8_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/moe_runner/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/moe_runner/base.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/rocm_moe_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/router.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/token_dispatcher/deepep.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/token_dispatcher/standard.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/moe/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/multimodal.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/awq.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/awq_triton.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/fpgemm_fp8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/kv_cache.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/marlin_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/marlin_utils_fp8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/petit.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/petit_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/qoq.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/quark/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/quark/quark.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/quark/quark_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/quark/schemes/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/quark/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/unquant.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/w4afp8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/lora_registry.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/multimodal_processor.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/scheduler_input_blocker.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/scheduler_metrics_mixin.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/scheduler_profiler_mixin.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/scheduler_recv_skipper.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/template_manager.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/allocator.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/allocator_ascend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/nixl/nixl_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_executor/npu_graph_runner.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/arcee.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/bailing_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/bert.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/clip.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/deepseek_janus_pro.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/deepseek_nextn.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/deepseek_vl2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/ernie4.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/ernie4_eagle.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma3_causal.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma3_mm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma3n_audio.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma3n_causal.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gemma3n_mm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/glm4.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/glm4_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/glm4_moe_nextn.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/glm4v.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/glm4v_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/granite.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/granitemoe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/grok.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/hunyuan.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/idefics2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/interns1.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/internvl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/kimi_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llama.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llama4.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llava.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/mimo.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/mimo_mtp.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/minicpmo.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/mllama4.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/nemotron_nas.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/persimmon.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/phi.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/phi4mm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/phi4mm_audio.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/phi4mm_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/phimoe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/pixtral.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2_audio.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen3.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen3_classification.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/qwen3_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/registry.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/roberta.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/siglip.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/step3_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/vila.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/mm_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/clip.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/gemma3.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/gemma3n.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/glm4v.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/internvl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/janus_pro.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/kimi_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/llava.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/minicpm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/mlama.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/mllama4.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/phi4mm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/pixtral.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/qwen_audio.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/qwen_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/step3_vl.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/multimodal/processors/vila.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/offloader.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/operations.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/operations_strategy.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/patch_torch.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/poll_based_barrier.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/two_batch_overlap.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/warmup.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/weight_sync/tensor_bucket.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/srt/weight_sync/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/attention/__init__.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/attention/test_flashattn_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/doc_patch.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/run_eval.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/runners.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/send_one.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_activation.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_block_fp8_ep.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_custom_ops.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_cutlass_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_cutlass_w4a8_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_deepep_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_fp4_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_marlin_moe.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_marlin_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_programs.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/test/test_utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang/utils.py +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.5.1.post2 → sglang-0.5.2rc0}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.5.
|
3
|
+
Version: 0.5.2rc0
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
|
|
251
251
|
Requires-Dist: timm==1.0.16; extra == "runtime-common"
|
252
252
|
Requires-Dist: tiktoken; extra == "runtime-common"
|
253
253
|
Requires-Dist: torchao==0.9.0; extra == "runtime-common"
|
254
|
-
Requires-Dist: transformers==4.
|
254
|
+
Requires-Dist: transformers==4.56.0; extra == "runtime-common"
|
255
255
|
Requires-Dist: uvicorn; extra == "runtime-common"
|
256
256
|
Requires-Dist: uvloop; extra == "runtime-common"
|
257
257
|
Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
|
258
258
|
Provides-Extra: srt
|
259
259
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
260
|
-
Requires-Dist: sgl-kernel==0.3.
|
260
|
+
Requires-Dist: sgl-kernel==0.3.7.post1; extra == "srt"
|
261
261
|
Requires-Dist: torch==2.8.0; extra == "srt"
|
262
262
|
Requires-Dist: torchaudio==2.8.0; extra == "srt"
|
263
263
|
Requires-Dist: torchvision; extra == "srt"
|
264
264
|
Requires-Dist: cuda-python; extra == "srt"
|
265
|
-
Requires-Dist: flashinfer_python==0.
|
265
|
+
Requires-Dist: flashinfer_python==0.3.0; extra == "srt"
|
266
266
|
Provides-Extra: blackwell
|
267
267
|
Requires-Dist: sglang[runtime_common]; extra == "blackwell"
|
268
268
|
Requires-Dist: sgl-kernel; extra == "blackwell"
|
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
|
|
270
270
|
Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
|
271
271
|
Requires-Dist: torchvision; extra == "blackwell"
|
272
272
|
Requires-Dist: cuda-python; extra == "blackwell"
|
273
|
-
Requires-Dist: flashinfer_python==0.
|
273
|
+
Requires-Dist: flashinfer_python==0.3.0; extra == "blackwell"
|
274
274
|
Provides-Extra: srt-hip
|
275
275
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
|
276
276
|
Requires-Dist: torch; extra == "srt-hip"
|
@@ -304,6 +304,7 @@ Requires-Dist: pandas; extra == "test"
|
|
304
304
|
Requires-Dist: peft; extra == "test"
|
305
305
|
Requires-Dist: sentence_transformers; extra == "test"
|
306
306
|
Requires-Dist: pytest; extra == "test"
|
307
|
+
Requires-Dist: tabulate; extra == "test"
|
307
308
|
Provides-Extra: all
|
308
309
|
Requires-Dist: sglang[srt]; extra == "all"
|
309
310
|
Requires-Dist: sglang[openai]; extra == "all"
|
@@ -374,7 +375,7 @@ Dynamic: license-file
|
|
374
375
|
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
375
376
|
|
376
377
|
## News
|
377
|
-
- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
|
378
|
+
- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
|
378
379
|
- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
|
379
380
|
- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
|
380
381
|
- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
|
@@ -20,7 +20,7 @@
|
|
20
20
|
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
21
21
|
|
22
22
|
## News
|
23
|
-
- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
|
23
|
+
- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
|
24
24
|
- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
|
25
25
|
- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
|
26
26
|
- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "sglang"
|
7
|
-
version = "0.5.
|
7
|
+
version = "0.5.2rc0"
|
8
8
|
description = "SGLang is yet another fast serving framework for large language models and vision language models."
|
9
9
|
readme = "README.md"
|
10
10
|
requires-python = ">=3.10"
|
@@ -50,7 +50,7 @@ runtime_common = [
|
|
50
50
|
"timm==1.0.16",
|
51
51
|
"tiktoken",
|
52
52
|
"torchao==0.9.0",
|
53
|
-
"transformers==4.
|
53
|
+
"transformers==4.56.0",
|
54
54
|
"uvicorn",
|
55
55
|
"uvloop",
|
56
56
|
"xgrammar==0.1.23",
|
@@ -58,12 +58,12 @@ runtime_common = [
|
|
58
58
|
|
59
59
|
srt = [
|
60
60
|
"sglang[runtime_common]",
|
61
|
-
"sgl-kernel==0.3.
|
61
|
+
"sgl-kernel==0.3.7.post1",
|
62
62
|
"torch==2.8.0",
|
63
63
|
"torchaudio==2.8.0",
|
64
64
|
"torchvision",
|
65
65
|
"cuda-python",
|
66
|
-
"flashinfer_python==0.
|
66
|
+
"flashinfer_python==0.3.0",
|
67
67
|
]
|
68
68
|
|
69
69
|
blackwell = [
|
@@ -73,7 +73,7 @@ blackwell = [
|
|
73
73
|
"torchaudio==2.8.0",
|
74
74
|
"torchvision",
|
75
75
|
"cuda-python",
|
76
|
-
"flashinfer_python==0.
|
76
|
+
"flashinfer_python==0.3.0",
|
77
77
|
]
|
78
78
|
|
79
79
|
# HIP (Heterogeneous-computing Interface for Portability) for AMD
|
@@ -113,6 +113,7 @@ test = [
|
|
113
113
|
"peft",
|
114
114
|
"sentence_transformers",
|
115
115
|
"pytest",
|
116
|
+
"tabulate",
|
116
117
|
]
|
117
118
|
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[torch_memory_saver]", "sglang[decord]"]
|
118
119
|
all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
|
@@ -61,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
|
|
61
61
|
from sglang.srt.distributed.parallel_state import destroy_distributed_environment
|
62
62
|
from sglang.srt.entrypoints.engine import _set_envs_and_config
|
63
63
|
from sglang.srt.hf_transformers_utils import get_tokenizer
|
64
|
+
from sglang.srt.layers.moe import initialize_moe_config
|
64
65
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
65
66
|
from sglang.srt.managers.scheduler import Scheduler
|
66
67
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
@@ -509,6 +510,8 @@ def latency_test(
|
|
509
510
|
bench_args,
|
510
511
|
tp_rank,
|
511
512
|
):
|
513
|
+
initialize_moe_config(server_args)
|
514
|
+
|
512
515
|
# Set CPU affinity
|
513
516
|
if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
|
514
517
|
set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
|
@@ -18,7 +18,7 @@ import json
|
|
18
18
|
import multiprocessing
|
19
19
|
import os
|
20
20
|
import time
|
21
|
-
from typing import Tuple
|
21
|
+
from typing import List, Tuple
|
22
22
|
|
23
23
|
import requests
|
24
24
|
|
@@ -45,6 +45,7 @@ class BenchArgs:
|
|
45
45
|
skip_warmup: bool = False
|
46
46
|
show_report: bool = False
|
47
47
|
profile: bool = False
|
48
|
+
profile_steps: int = 3
|
48
49
|
profile_by_stage: bool = False
|
49
50
|
|
50
51
|
@staticmethod
|
@@ -78,6 +79,9 @@ class BenchArgs:
|
|
78
79
|
parser.add_argument("--skip-warmup", action="store_true")
|
79
80
|
parser.add_argument("--show-report", action="store_true")
|
80
81
|
parser.add_argument("--profile", action="store_true")
|
82
|
+
parser.add_argument(
|
83
|
+
"--profile-steps", type=int, default=BenchArgs.profile_steps
|
84
|
+
)
|
81
85
|
parser.add_argument("--profile-by-stage", action="store_true")
|
82
86
|
|
83
87
|
@classmethod
|
@@ -132,6 +136,7 @@ def run_one_case(
|
|
132
136
|
result_filename: str,
|
133
137
|
tokenizer,
|
134
138
|
profile: bool = False,
|
139
|
+
profile_steps: int = 3,
|
135
140
|
profile_by_stage: bool = False,
|
136
141
|
):
|
137
142
|
requests.post(url + "/flush_cache")
|
@@ -162,7 +167,7 @@ def run_one_case(
|
|
162
167
|
profile_link = None
|
163
168
|
if profile:
|
164
169
|
profile_link: str = run_profile(
|
165
|
-
url,
|
170
|
+
url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
|
166
171
|
)
|
167
172
|
|
168
173
|
tic = time.perf_counter()
|
@@ -247,6 +252,71 @@ def run_one_case(
|
|
247
252
|
)
|
248
253
|
|
249
254
|
|
255
|
+
def get_report_summary(
|
256
|
+
result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
|
257
|
+
):
|
258
|
+
import tabulate
|
259
|
+
|
260
|
+
summary = (
|
261
|
+
f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
|
262
|
+
)
|
263
|
+
|
264
|
+
headers = [
|
265
|
+
"batch size",
|
266
|
+
"latency (s)",
|
267
|
+
"input throughput (tok/s)",
|
268
|
+
"output throughput (tok/s)",
|
269
|
+
"acc length",
|
270
|
+
"ITL (ms)",
|
271
|
+
"input cost ($/1M)",
|
272
|
+
"output cost ($/1M)",
|
273
|
+
]
|
274
|
+
if bench_args.profile:
|
275
|
+
headers.append("profile")
|
276
|
+
rows = []
|
277
|
+
|
278
|
+
for (
|
279
|
+
batch_size,
|
280
|
+
latency,
|
281
|
+
ttft,
|
282
|
+
input_throughput,
|
283
|
+
output_throughput,
|
284
|
+
_,
|
285
|
+
_,
|
286
|
+
acc_length,
|
287
|
+
trace_link,
|
288
|
+
) in result:
|
289
|
+
if is_blackwell():
|
290
|
+
hourly_cost_per_gpu = 4 # $4/hour for one B200
|
291
|
+
else:
|
292
|
+
hourly_cost_per_gpu = 2 # $2/hour for one H100
|
293
|
+
|
294
|
+
hourly_cost = hourly_cost_per_gpu * server_args.tp_size
|
295
|
+
input_util = 0.7
|
296
|
+
accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
|
297
|
+
itl = 1 / (output_throughput / batch_size) * 1000
|
298
|
+
input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
|
299
|
+
output_cost = 1e6 / output_throughput / 3600 * hourly_cost
|
300
|
+
row = [
|
301
|
+
batch_size,
|
302
|
+
latency,
|
303
|
+
input_throughput,
|
304
|
+
output_throughput,
|
305
|
+
accept_length,
|
306
|
+
itl,
|
307
|
+
input_cost,
|
308
|
+
output_cost,
|
309
|
+
]
|
310
|
+
if trace_link:
|
311
|
+
row.append(f"[Profile]({trace_link})")
|
312
|
+
rows.append(row)
|
313
|
+
|
314
|
+
summary += tabulate.tabulate(
|
315
|
+
rows, headers=headers, tablefmt="github", floatfmt=".2f"
|
316
|
+
)
|
317
|
+
return summary
|
318
|
+
|
319
|
+
|
250
320
|
def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
251
321
|
if bench_args.base_url:
|
252
322
|
proc, base_url = None, bench_args.base_url
|
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
321
391
|
result_filename=bench_args.result_filename,
|
322
392
|
tokenizer=tokenizer,
|
323
393
|
profile=bench_args.profile,
|
394
|
+
profile_steps=bench_args.profile_steps,
|
324
395
|
profile_by_stage=bench_args.profile_by_stage,
|
325
396
|
)[-1],
|
326
397
|
)
|
@@ -337,63 +408,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
337
408
|
if not bench_args.show_report:
|
338
409
|
return
|
339
410
|
|
340
|
-
summary = (
|
341
|
-
f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
|
342
|
-
)
|
343
|
-
summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
|
344
|
-
|
345
|
-
if bench_args.profile:
|
346
|
-
summary += " profile |"
|
347
|
-
|
348
|
-
summary += "\n"
|
349
|
-
summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
|
350
|
-
|
351
|
-
if bench_args.profile:
|
352
|
-
summary += "-------------|"
|
353
|
-
summary += "\n"
|
354
|
-
|
355
|
-
for (
|
356
|
-
batch_size,
|
357
|
-
latency,
|
358
|
-
ttft,
|
359
|
-
input_throughput,
|
360
|
-
output_throughput,
|
361
|
-
overall_throughput,
|
362
|
-
last_gen_throughput,
|
363
|
-
acc_length,
|
364
|
-
trace_link,
|
365
|
-
) in result:
|
366
|
-
if is_blackwell():
|
367
|
-
hourly_cost_per_gpu = 4 # $4/hour for one B200
|
368
|
-
else:
|
369
|
-
hourly_cost_per_gpu = 2 # $2/hour for one H100
|
370
|
-
|
371
|
-
hourly_cost = hourly_cost_per_gpu * server_args.tp_size
|
372
|
-
input_util = 0.7
|
373
|
-
accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
|
374
|
-
line = (
|
375
|
-
f"| {batch_size} | "
|
376
|
-
f"{latency:.2f} | "
|
377
|
-
f"{input_throughput:.2f} | "
|
378
|
-
f"{output_throughput:.2f} | "
|
379
|
-
f"{accept_length} | "
|
380
|
-
f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
|
381
|
-
f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
|
382
|
-
f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
|
383
|
-
)
|
384
|
-
if trace_link:
|
385
|
-
line += f" [Profile]({trace_link}) |"
|
386
|
-
line += "\n"
|
387
|
-
summary += line
|
388
|
-
|
389
|
-
# print metrics table
|
411
|
+
summary = get_report_summary(result, server_args, bench_args)
|
390
412
|
print(summary)
|
391
413
|
|
392
414
|
if is_in_ci():
|
393
415
|
write_github_step_summary(summary)
|
394
416
|
|
395
417
|
|
396
|
-
|
418
|
+
def main():
|
397
419
|
parser = argparse.ArgumentParser()
|
398
420
|
ServerArgs.add_cli_args(parser)
|
399
421
|
BenchArgs.add_cli_args(parser)
|
@@ -402,3 +424,7 @@ if __name__ == "__main__":
|
|
402
424
|
bench_args = BenchArgs.from_cli_args(args)
|
403
425
|
|
404
426
|
run_benchmark(server_args, bench_args)
|
427
|
+
|
428
|
+
|
429
|
+
if __name__ == "__main__":
|
430
|
+
main()
|
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
|
|
12
12
|
|
13
13
|
import argparse
|
14
14
|
import asyncio
|
15
|
+
import base64
|
16
|
+
import io
|
15
17
|
import json
|
16
18
|
import os
|
17
19
|
import pickle
|
@@ -71,7 +73,7 @@ class RequestFuncInput:
|
|
71
73
|
output_len: int
|
72
74
|
model: str
|
73
75
|
lora_name: str
|
74
|
-
image_data: str
|
76
|
+
image_data: Optional[List[str]]
|
75
77
|
extra_request_body: Dict[str, Any]
|
76
78
|
|
77
79
|
|
@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
|
|
289
291
|
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
290
292
|
|
291
293
|
if request_func_input.image_data:
|
294
|
+
# Build multi-image content: a list of image_url entries followed by the text
|
295
|
+
content_items = [
|
296
|
+
{
|
297
|
+
"type": "image_url",
|
298
|
+
"image_url": {"url": img_url},
|
299
|
+
}
|
300
|
+
for img_url in request_func_input.image_data
|
301
|
+
]
|
302
|
+
content_items.append({"type": "text", "text": request_func_input.prompt})
|
292
303
|
messages = [
|
293
304
|
{
|
294
305
|
"role": "user",
|
295
|
-
"content":
|
296
|
-
{
|
297
|
-
"type": "image_url",
|
298
|
-
"image_url": {"url": request_func_input.image_data},
|
299
|
-
},
|
300
|
-
{"type": "text", "text": request_func_input.prompt},
|
301
|
-
],
|
306
|
+
"content": content_items,
|
302
307
|
},
|
303
308
|
]
|
304
309
|
else:
|
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
|
|
497
502
|
**request_func_input.extra_request_body,
|
498
503
|
}
|
499
504
|
|
500
|
-
# Add image data if available
|
505
|
+
# Add image data if available (list of image urls/base64)
|
501
506
|
if request_func_input.image_data:
|
502
507
|
payload["image_data"] = request_func_input.image_data
|
503
508
|
|
@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
|
|
648
653
|
prompt_suffix=args.prompt_suffix,
|
649
654
|
apply_chat_template=args.apply_chat_template,
|
650
655
|
)
|
651
|
-
elif args.dataset_name.startswith("random"):
|
656
|
+
elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
|
652
657
|
input_requests = sample_random_requests(
|
653
658
|
input_len=args.random_input_len,
|
654
659
|
output_len=args.random_output_len,
|
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
|
|
659
664
|
random_sample=args.dataset_name == "random",
|
660
665
|
return_text=not tokenize_prompt,
|
661
666
|
)
|
667
|
+
elif args.dataset_name == "random-image":
|
668
|
+
assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
|
669
|
+
input_requests = sample_random_image_requests(
|
670
|
+
num_requests=args.num_prompts,
|
671
|
+
num_images=args.random_image_num_images,
|
672
|
+
input_len=args.random_input_len,
|
673
|
+
output_len=args.random_output_len,
|
674
|
+
range_ratio=args.random_range_ratio,
|
675
|
+
tokenizer=tokenizer,
|
676
|
+
apply_chat_template=args.apply_chat_template,
|
677
|
+
image_resolution=args.random_image_resolution,
|
678
|
+
)
|
662
679
|
elif args.dataset_name == "generated-shared-prefix":
|
663
680
|
assert not tokenize_prompt
|
664
681
|
input_requests = sample_generated_shared_prefix_requests(
|
@@ -790,7 +807,7 @@ class DatasetRow:
|
|
790
807
|
prompt: str
|
791
808
|
prompt_len: int
|
792
809
|
output_len: int
|
793
|
-
image_data: Optional[str] = None
|
810
|
+
image_data: Optional[List[str]] = None
|
794
811
|
|
795
812
|
|
796
813
|
def sample_mmmu_requests(
|
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
|
|
913
930
|
prompt=prompt,
|
914
931
|
prompt_len=prompt_len,
|
915
932
|
output_len=output_len,
|
916
|
-
image_data=image_data,
|
933
|
+
image_data=[image_data],
|
917
934
|
)
|
918
935
|
)
|
919
936
|
|
@@ -1113,6 +1130,132 @@ def sample_random_requests(
|
|
1113
1130
|
return input_requests
|
1114
1131
|
|
1115
1132
|
|
1133
|
+
def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
|
1134
|
+
"""Parse image resolution into (width, height).
|
1135
|
+
|
1136
|
+
Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
|
1137
|
+
(e.g., '1080x1920' means height=1080, width=1920).
|
1138
|
+
"""
|
1139
|
+
resolution_to_size = {
|
1140
|
+
"4k": (3840, 2160),
|
1141
|
+
"1080p": (1920, 1080),
|
1142
|
+
"720p": (1280, 720),
|
1143
|
+
"360p": (640, 360),
|
1144
|
+
}
|
1145
|
+
if image_resolution in resolution_to_size:
|
1146
|
+
return resolution_to_size[image_resolution]
|
1147
|
+
|
1148
|
+
res = image_resolution.strip().lower()
|
1149
|
+
if "x" in res:
|
1150
|
+
parts = res.split("x")
|
1151
|
+
if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
|
1152
|
+
height = int(parts[0])
|
1153
|
+
width = int(parts[1])
|
1154
|
+
if height > 0 and width > 0:
|
1155
|
+
return (width, height)
|
1156
|
+
|
1157
|
+
raise ValueError(
|
1158
|
+
f"Unsupported random-image resolution: {image_resolution}. "
|
1159
|
+
"Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
|
1160
|
+
)
|
1161
|
+
|
1162
|
+
|
1163
|
+
def sample_random_image_requests(
    num_requests: int,
    num_images: int,
    input_len: int,
    output_len: int,
    range_ratio: float,
    tokenizer: PreTrainedTokenizerBase,
    apply_chat_template: bool = True,
    image_resolution: str = "1080p",
) -> List[DatasetRow]:
    """Generate benchmark requests that each carry random JPEG images.

    - Each request includes ``num_images`` random images.
    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720),
      360p (640x360), or custom 'heightxwidth' (e.g., 1080x1920).
    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
      only counts text tokens and excludes image data.

    Args:
        num_requests: Number of DatasetRow entries to generate.
        num_images: Number of random images attached to every request.
        input_len: Upper bound (inclusive) of the sampled text prompt length.
        output_len: Upper bound (inclusive) of the sampled output length.
        range_ratio: Ratio setting the lower bound of the sampled lengths.
        tokenizer: Tokenizer used to generate prompt text and count tokens.
        apply_chat_template: Wrap prompt and images in the tokenizer's chat
            template when the tokenizer supports list-form content.
        image_resolution: Resolution preset or custom 'heightxwidth' string.

    Returns:
        A list of ``num_requests`` DatasetRow objects whose ``image_data``
        holds base64-encoded JPEG data URIs.

    Raises:
        ImportError: If pybase64 or Pillow is not installed.
    """
    try:
        import pybase64
        from PIL import Image
    except ImportError as e:
        # Both packages are required: Pillow renders the image, pybase64
        # encodes it. Name both so the user installs whichever is missing.
        raise ImportError(
            "Please install pybase64 and Pillow to generate random images: "
            "pip install pybase64 pillow"
        ) from e

    # Parse resolution (supports presets and 'heightxwidth')
    width, height = parse_random_image_resolution(image_resolution)

    # Warn early: rendering and base64-encoding many large JPEGs can dominate
    # benchmark setup time.
    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
        warnings.warn(
            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
            f"may take a long time. Consider reducing resolution or image count.",
            UserWarning,
            stacklevel=2,
        )

    # Sample text lengths following the 'random' dataset rule.
    # NOTE(review): the input lower bound is clamped to >= 1 but the output
    # lower bound is not, so output_len may sample to 0 whenever
    # int(output_len * range_ratio) == 0 — confirm this asymmetry is intended.
    input_lens = np.random.randint(
        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio), output_len + 1, size=num_requests
    )

    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
        """Render one random RGB image and return it as a JPEG data URI."""
        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
        img = Image.fromarray(arr, mode="RGB")
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=85)
        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
        return f"data:image/jpeg;base64,{encoded}"

    dataset: List[DatasetRow] = []
    for i in range(num_requests):
        # Generate text prompt
        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))

        # Generate image list
        images = [_gen_random_image_data_uri() for _ in range(num_images)]

        prompt_str = text_prompt
        if apply_chat_template:
            try:
                content_items = [
                    {"type": "image_url", "image_url": {"url": img_url}}
                    for img_url in images
                ]
                content_items.append({"type": "text", "text": text_prompt})
                prompt_str = tokenizer.apply_chat_template(
                    [{"role": "user", "content": content_items}],
                    add_generation_prompt=True,
                    tokenize=False,
                )
            except Exception:
                # Some tokenizers do not support list content; fall back to a placeholder in the text
                prompt_str = f"<image>{text_prompt}"

        # prompt_len counts only text tokens; image payloads are excluded.
        prompt_token_ids = tokenizer.encode(prompt_str)
        prompt_token_len = len(prompt_token_ids)

        dataset.append(
            DatasetRow(
                prompt=prompt_str,
                prompt_len=prompt_token_len,
                output_len=int(output_lens[i]),
                image_data=images,
            )
        )

    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
    return dataset
|
1257
|
+
|
1258
|
+
|
1116
1259
|
def gen_prompt(tokenizer, token_num):
|
1117
1260
|
"""Generate a random prompt of specified token length using tokenizer vocabulary."""
|
1118
1261
|
all_available_tokens = list(tokenizer.get_vocab().values())
|
@@ -1579,7 +1722,13 @@ async def benchmark(
|
|
1579
1722
|
output_file_name = args.output_file
|
1580
1723
|
else:
|
1581
1724
|
now = datetime.now().strftime("%m%d")
|
1582
|
-
if args.dataset_name
|
1725
|
+
if args.dataset_name == "random-image":
|
1726
|
+
output_file_name = (
|
1727
|
+
f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
|
1728
|
+
f"{args.random_output_len}_{args.random_image_num_images}imgs_"
|
1729
|
+
f"{args.random_image_resolution}.jsonl"
|
1730
|
+
)
|
1731
|
+
elif args.dataset_name.startswith("random"):
|
1583
1732
|
output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
|
1584
1733
|
else:
|
1585
1734
|
output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
|
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
|
|
1819
1968
|
"--dataset-name",
|
1820
1969
|
type=str,
|
1821
1970
|
default="sharegpt",
|
1822
|
-
choices=[
|
1971
|
+
choices=[
|
1972
|
+
"sharegpt",
|
1973
|
+
"random",
|
1974
|
+
"random-ids",
|
1975
|
+
"generated-shared-prefix",
|
1976
|
+
"mmmu",
|
1977
|
+
"random-image",
|
1978
|
+
],
|
1823
1979
|
help="Name of the dataset to benchmark on.",
|
1824
1980
|
)
|
1825
1981
|
parser.add_argument(
|
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
|
|
1872
2028
|
help="Range of sampled ratio of input/output length, "
|
1873
2029
|
"used only for random dataset.",
|
1874
2030
|
)
|
2031
|
+
# random-image dataset args
|
2032
|
+
parser.add_argument(
|
2033
|
+
"--random-image-num-images",
|
2034
|
+
type=int,
|
2035
|
+
default=1,
|
2036
|
+
help="Number of images per request (only available with the random-image dataset)",
|
2037
|
+
)
|
2038
|
+
parser.add_argument(
|
2039
|
+
"--random-image-resolution",
|
2040
|
+
type=str,
|
2041
|
+
default="1080p",
|
2042
|
+
help=(
|
2043
|
+
"Resolution of random images for random-image dataset. "
|
2044
|
+
"Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
|
2045
|
+
),
|
2046
|
+
)
|
1875
2047
|
parser.add_argument(
|
1876
2048
|
"--request-rate",
|
1877
2049
|
type=float,
|
@@ -5,6 +5,7 @@ from sglang.srt.configs.exaone import ExaoneConfig
|
|
5
5
|
from sglang.srt.configs.janus_pro import MultiModalityConfig
|
6
6
|
from sglang.srt.configs.kimi_vl import KimiVLConfig
|
7
7
|
from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
|
8
|
+
from sglang.srt.configs.longcat_flash import LongcatFlashConfig
|
8
9
|
from sglang.srt.configs.step3_vl import (
|
9
10
|
Step3TextConfig,
|
10
11
|
Step3VisionEncoderConfig,
|
@@ -16,6 +17,7 @@ __all__ = [
|
|
16
17
|
"ChatGLMConfig",
|
17
18
|
"DbrxConfig",
|
18
19
|
"DeepseekVL2Config",
|
20
|
+
"LongcatFlashConfig",
|
19
21
|
"MultiModalityConfig",
|
20
22
|
"KimiVLConfig",
|
21
23
|
"MoonViTConfig",
|