sglang 0.4.4.post4.tar.gz → 0.4.5.tar.gz

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. {sglang-0.4.4.post4/sglang.egg-info → sglang-0.4.5}/PKG-INFO +1 -1
  2. {sglang-0.4.4.post4 → sglang-0.4.5}/pyproject.toml +1 -1
  3. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/chat_template.py +24 -0
  4. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/model_config.py +4 -0
  5. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/conversation.py +29 -4
  6. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/flashattention_backend.py +286 -9
  7. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_native.py +5 -0
  8. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  9. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  10. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  11. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  12. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  13. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  14. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  15. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -3
  16. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  17. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/__init__.py +1 -0
  18. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  19. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/fp8.py +3 -1
  20. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/moe_wna16.py +2 -0
  21. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  22. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/radix_attention.py +2 -0
  23. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/rotary_embedding.py +63 -0
  24. sglang-0.4.5/sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  25. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_executor/model_runner.py +1 -0
  26. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llama.py +12 -4
  27. sglang-0.4.5/sglang/srt/models/llama4.py +420 -0
  28. sglang-0.4.5/sglang/srt/models/mllama4.py +154 -0
  29. sglang-0.4.5/sglang/version.py +1 -0
  30. {sglang-0.4.4.post4 → sglang-0.4.5/sglang.egg-info}/PKG-INFO +1 -1
  31. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang.egg-info/SOURCES.txt +10 -0
  32. sglang-0.4.4.post4/sglang/version.py +0 -1
  33. {sglang-0.4.4.post4 → sglang-0.4.5}/LICENSE +0 -0
  34. {sglang-0.4.4.post4 → sglang-0.4.5}/README.md +0 -0
  35. {sglang-0.4.4.post4 → sglang-0.4.5}/setup.cfg +0 -0
  36. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/__init__.py +0 -0
  37. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/api.py +0 -0
  38. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/bench_offline_throughput.py +0 -0
  39. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/bench_one_batch.py +0 -0
  40. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/bench_one_batch_server.py +0 -0
  41. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/bench_serving.py +0 -0
  42. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/check_env.py +0 -0
  43. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/global_config.py +0 -0
  44. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/__init__.py +0 -0
  45. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/backend/__init__.py +0 -0
  46. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/backend/anthropic.py +0 -0
  47. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/backend/base_backend.py +0 -0
  48. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/backend/litellm.py +0 -0
  49. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/backend/openai.py +0 -0
  50. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/backend/runtime_endpoint.py +0 -0
  51. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/backend/vertexai.py +0 -0
  52. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/choices.py +0 -0
  53. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/compiler.py +0 -0
  54. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/interpreter.py +0 -0
  55. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/ir.py +0 -0
  56. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/lang/tracer.py +0 -0
  57. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/launch_server.py +0 -0
  58. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/llama3_eval.py +0 -0
  59. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/_custom_ops.py +0 -0
  60. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/aio_rwlock.py +0 -0
  61. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/code_completion_parser.py +0 -0
  62. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/__init__.py +0 -0
  63. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/chatglm.py +0 -0
  64. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/dbrx.py +0 -0
  65. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/deepseekvl2.py +0 -0
  66. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/device_config.py +0 -0
  67. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/exaone.py +0 -0
  68. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/janus_pro.py +0 -0
  69. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/load_config.py +0 -0
  70. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/configs/utils.py +0 -0
  71. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/__init__.py +0 -0
  72. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/base_connector.py +0 -0
  73. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/redis.py +0 -0
  74. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/s3.py +0 -0
  75. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/serde/__init__.py +0 -0
  76. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/serde/safe_serde.py +0 -0
  77. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/serde/serde.py +0 -0
  78. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/connector/utils.py +0 -0
  79. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  80. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/constrained/llguidance_backend.py +0 -0
  81. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/constrained/outlines_backend.py +0 -0
  82. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  83. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  84. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/custom_op.py +0 -0
  85. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/disaggregation/conn.py +0 -0
  86. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/disaggregation/decode.py +0 -0
  87. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/disaggregation/mini_lb.py +0 -0
  88. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/disaggregation/prefill.py +0 -0
  89. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/disaggregation/utils.py +0 -0
  90. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/__init__.py +0 -0
  91. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/communication_op.py +0 -0
  92. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  93. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  94. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  95. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  96. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  97. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  98. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  99. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  100. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/parallel_state.py +0 -0
  101. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/distributed/utils.py +0 -0
  102. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/entrypoints/engine.py +0 -0
  103. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/entrypoints/http_server.py +0 -0
  104. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/entrypoints/verl_engine.py +0 -0
  105. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/function_call_parser.py +0 -0
  106. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/hf_transformers_utils.py +0 -0
  107. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/activation.py +0 -0
  108. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  109. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  110. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  111. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  112. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  113. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  114. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/triton_backend.py +0 -0
  115. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  116. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  117. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  118. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  119. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  120. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/utils.py +0 -0
  121. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/attention/vision.py +0 -0
  122. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/dp_attention.py +0 -0
  123. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/elementwise.py +0 -0
  124. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/layernorm.py +0 -0
  125. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/linear.py +0 -0
  126. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/logits_processor.py +0 -0
  127. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  128. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  129. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  130. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
  131. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  132. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  133. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  134. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  135. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  136. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  137. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  138. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  139. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  140. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  141. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  142. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  143. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  144. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  145. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  146. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  147. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  148. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  149. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  150. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  151. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  152. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  153. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  154. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  155. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  156. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  157. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  158. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  159. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  160. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  161. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  162. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  163. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  164. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  165. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  166. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  167. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  168. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  169. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  170. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  171. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  172. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  173. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  174. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  175. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  176. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  177. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  178. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  179. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  180. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  181. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  182. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  183. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  184. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  185. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  186. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  187. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  188. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  189. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  190. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  191. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  192. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  193. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  194. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  195. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  196. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  197. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  198. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  199. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  200. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  201. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  202. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  203. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  204. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  205. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  206. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  207. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  208. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  209. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  210. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  211. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  212. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  213. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  214. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  215. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  216. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  217. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  218. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  219. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  220. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  221. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  222. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  223. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  224. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  225. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  226. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  227. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  228. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  229. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  230. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  231. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  232. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  233. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  234. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  235. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  236. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  237. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  238. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  239. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  240. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  241. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  242. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  243. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  244. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  245. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  246. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  247. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  248. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  249. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  250. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  251. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  252. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  253. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  254. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/router.py +0 -0
  255. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/moe/topk.py +0 -0
  256. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/parameter.py +0 -0
  257. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/pooler.py +0 -0
  258. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/awq.py +0 -0
  259. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/base_config.py +0 -0
  260. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  261. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  262. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  263. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  264. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  265. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  266. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  267. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  268. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  269. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  270. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  271. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  272. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  273. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  274. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  275. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  276. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  277. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  278. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  279. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  280. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  281. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  282. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  283. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  284. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  285. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  286. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  287. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  288. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  289. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  290. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  291. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  292. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  293. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  294. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  295. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  296. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  297. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  298. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  299. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  300. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  301. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  302. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  303. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  304. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  305. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  306. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  307. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  308. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  309. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  310. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  311. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  312. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  313. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  314. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  315. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  316. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  317. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  318. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  319. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  320. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  321. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  322. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  323. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  324. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  325. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  326. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  327. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  328. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  329. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  330. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  331. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  332. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  333. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  334. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  335. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  336. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  337. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  338. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  339. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  340. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  341. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  342. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  343. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  344. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  345. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  346. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  347. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  348. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  420. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  421. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/gptq.py +0 -0
  422. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  423. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  424. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  425. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  426. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/utils.py +0 -0
  427. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  428. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/sampler.py +0 -0
  429. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/torchao_utils.py +0 -0
  430. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  431. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/backend/__init__.py +0 -0
  432. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/backend/base_backend.py +0 -0
  433. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  434. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/backend/triton_backend.py +0 -0
  435. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/layers.py +0 -0
  436. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/lora.py +0 -0
  437. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/lora_config.py +0 -0
  438. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/lora_manager.py +0 -0
  439. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/mem_pool.py +0 -0
  440. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  441. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  442. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  443. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  444. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  445. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/lora/utils.py +0 -0
  446. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/cache_controller.py +0 -0
  447. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/configure_logging.py +0 -0
  448. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/data_parallel_controller.py +0 -0
  449. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/detokenizer_manager.py +0 -0
  450. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/expert_distribution.py +0 -0
  451. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/io_struct.py +0 -0
  452. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/mm_utils.py +0 -0
  453. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processor.py +0 -0
  454. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/base_processor.py +0 -0
  455. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  456. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
  457. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
  458. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/janus_pro.py +0 -0
  459. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  460. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
  461. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  462. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
  463. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/schedule_batch.py +0 -0
  464. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/schedule_policy.py +0 -0
  465. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/scheduler.py +0 -0
  466. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  467. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/session_controller.py +0 -0
  468. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/tokenizer_manager.py +0 -0
  469. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/tp_worker.py +0 -0
  470. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  471. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/managers/utils.py +0 -0
  472. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  473. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  474. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mem_cache/flush_cache.py +0 -0
  475. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  476. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mem_cache/memory_pool.py +0 -0
  477. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  478. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mem_cache/radix_cache.py +0 -0
  479. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/metrics/collector.py +0 -0
  480. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/metrics/func_timer.py +0 -0
  481. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/mm_utils.py +0 -0
  482. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  483. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  484. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_loader/__init__.py +0 -0
  485. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_loader/loader.py +0 -0
  486. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_loader/utils.py +0 -0
  487. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_loader/weight_utils.py +0 -0
  488. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/model_parallel.py +0 -0
  489. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/baichuan.py +0 -0
  490. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/chatglm.py +0 -0
  491. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/clip.py +0 -0
  492. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/commandr.py +0 -0
  493. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/dbrx.py +0 -0
  494. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/deepseek.py +0 -0
  495. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  496. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/deepseek_nextn.py +0 -0
  497. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/deepseek_v2.py +0 -0
  498. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/deepseek_vl2.py +0 -0
  499. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/exaone.py +0 -0
  500. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/gemma.py +0 -0
  501. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/gemma2.py +0 -0
  502. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/gemma2_reward.py +0 -0
  503. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/gemma3_causal.py +0 -0
  504. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/gemma3_mm.py +0 -0
  505. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/gpt2.py +0 -0
  506. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/gpt_bigcode.py +0 -0
  507. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/granite.py +0 -0
  508. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/grok.py +0 -0
  509. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/internlm2.py +0 -0
  510. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/internlm2_reward.py +0 -0
  511. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llama_classification.py +0 -0
  512. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llama_eagle.py +0 -0
  513. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llama_eagle3.py +0 -0
  514. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llama_embedding.py +0 -0
  515. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llama_reward.py +0 -0
  516. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llava.py +0 -0
  517. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/llavavid.py +0 -0
  518. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/minicpm.py +0 -0
  519. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/minicpm3.py +0 -0
  520. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/minicpmo.py +0 -0
  521. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/minicpmv.py +0 -0
  522. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/mistral.py +0 -0
  523. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/mixtral.py +0 -0
  524. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/mixtral_quant.py +0 -0
  525. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/mllama.py +0 -0
  526. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/olmo.py +0 -0
  527. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/olmo2.py +0 -0
  528. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/olmoe.py +0 -0
  529. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/phi3_small.py +0 -0
  530. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen.py +0 -0
  531. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen2.py +0 -0
  532. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen2_5_vl.py +0 -0
  533. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen2_classification.py +0 -0
  534. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen2_eagle.py +0 -0
  535. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen2_moe.py +0 -0
  536. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen2_rm.py +0 -0
  537. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/qwen2_vl.py +0 -0
  538. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/registry.py +0 -0
  539. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/stablelm.py +0 -0
  540. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/torch_native_llama.py +0 -0
  541. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/xverse.py +0 -0
  542. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/xverse_moe.py +0 -0
  543. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/models/yivl.py +0 -0
  544. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/openai_api/adapter.py +0 -0
  545. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/openai_api/protocol.py +0 -0
  546. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/patch_torch.py +0 -0
  547. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/platforms/interface.py +0 -0
  548. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/reasoning_parser.py +0 -0
  549. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  550. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  551. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  552. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  553. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  554. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  555. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  556. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/sampling/sampling_params.py +0 -0
  557. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/server.py +0 -0
  558. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/server_args.py +0 -0
  559. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  560. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  561. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/speculative/eagle_utils.py +0 -0
  562. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/speculative/eagle_worker.py +0 -0
  563. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/speculative/spec_info.py +0 -0
  564. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  565. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/utils.py +0 -0
  566. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/srt/warmup.py +0 -0
  567. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/__init__.py +0 -0
  568. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/attention/__init__.py +0 -0
  569. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/attention/test_flashattn_backend.py +0 -0
  570. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/few_shot_gsm8k.py +0 -0
  571. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  572. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/run_eval.py +0 -0
  573. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/runners.py +0 -0
  574. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/send_one.py +0 -0
  575. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/simple_eval_common.py +0 -0
  576. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/simple_eval_gpqa.py +0 -0
  577. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/simple_eval_humaneval.py +0 -0
  578. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/simple_eval_math.py +0 -0
  579. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/simple_eval_mgsm.py +0 -0
  580. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/simple_eval_mmlu.py +0 -0
  581. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_activation.py +0 -0
  582. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_block_fp8.py +0 -0
  583. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_block_fp8_ep.py +0 -0
  584. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_custom_ops.py +0 -0
  585. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_dynamic_grad_mode.py +0 -0
  586. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_layernorm.py +0 -0
  587. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_programs.py +0 -0
  588. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/test/test_utils.py +0 -0
  589. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang/utils.py +0 -0
  590. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang.egg-info/dependency_links.txt +0 -0
  591. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang.egg-info/requires.txt +0 -0
  592. {sglang-0.4.4.post4 → sglang-0.4.5}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.4.post4
3
+ Version: 0.4.5
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.4.post4"
7
+ version = "0.4.5"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -294,6 +294,30 @@ register_chat_template(
294
294
  )
295
295
  )
296
296
 
297
+ # Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
298
+ register_chat_template(
299
+ ChatTemplate(
300
+ name="llama-4",
301
+ default_system_prompt=None,
302
+ role_prefix_and_suffix={
303
+ "system": (
304
+ "<|header_start|>system<|header_end|>\n\n",
305
+ "<|eot|>",
306
+ ),
307
+ "user": (
308
+ "<|header_start|>user<|header_end|>\n\n",
309
+ "<|eot|>",
310
+ ),
311
+ "assistant": (
312
+ "<|header_start|>assistant<|header_end|>\n\n",
313
+ "<|eot|>",
314
+ ),
315
+ },
316
+ stop_str=("<|eot|>",),
317
+ image_token="<|image|>",
318
+ )
319
+ )
320
+
297
321
  # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
298
322
  register_chat_template(
299
323
  ChatTemplate(
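
The llama-4 template registered above only supplies per-role prefixes and suffixes; the final prompt is simply their concatenation around each message. A minimal standalone sketch of that concatenation follows — plain Python with made-up messages, not sglang's own rendering code.

# Illustrative only: compose a Llama 4 style prompt from the role prefixes and
# suffixes registered above; the messages are invented for this example.
role_markup = {
    "system": ("<|header_start|>system<|header_end|>\n\n", "<|eot|>"),
    "user": ("<|header_start|>user<|header_end|>\n\n", "<|eot|>"),
    "assistant": ("<|header_start|>assistant<|header_end|>\n\n", "<|eot|>"),
}

def render(messages):
    parts = []
    for role, text in messages:
        prefix, suffix = role_markup[role]
        parts.append(prefix + text + suffix)
    parts.append(role_markup["assistant"][0])  # open header for the model's reply
    return "".join(parts)

print(render([("system", "You are a helpful assistant."), ("user", "Hello!")]))
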
@@ -65,6 +65,9 @@ class ModelConfig:
65
65
  **kwargs,
66
66
  )
67
67
  self.hf_text_config = get_hf_text_config(self.hf_config)
68
+ self.attention_chunk_size = getattr(
69
+ self.hf_text_config, "attention_chunk_size", None
70
+ )
68
71
 
69
72
  # Check model type
70
73
  self.is_generation = is_generation_model(
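
The attention_chunk_size attribute added above is read straight off the HF text config and stays None for models that do not define it. A tiny illustration, with SimpleNamespace standing in for the real config object and 8192 used only as an example value:

# attention_chunk_size is absent (None) unless the HF text config defines it.
from types import SimpleNamespace

with_chunk = SimpleNamespace(attention_chunk_size=8192)   # e.g. a Llama 4 style config
without_chunk = SimpleNamespace()                         # a config without the field

print(getattr(with_chunk, "attention_chunk_size", None))     # 8192
print(getattr(without_chunk, "attention_chunk_size", None))  # None
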
@@ -467,6 +470,7 @@ multimodal_model_archs = [
467
470
  "Gemma3ForConditionalGeneration",
468
471
  "Grok1VForCausalLM",
469
472
  "Grok1AForCausalLM",
473
+ # TODO: add multimodal support for "Llama4ForConditionalGeneration",
470
474
  "LlavaLlamaForCausalLM",
471
475
  "LlavaMistralForCausalLM",
472
476
  "LlavaQwenForCausalLM",
@@ -33,6 +33,7 @@ class SeparatorStyle(IntEnum):
33
33
  ADD_NEW_LINE_SINGLE = auto()
34
34
  LLAMA2 = auto()
35
35
  LLAMA3 = auto()
36
+ LLAMA4 = auto()
36
37
  CHATGLM = auto()
37
38
  CHATML = auto()
38
39
  CHATINTERN = auto()
@@ -156,19 +157,30 @@ class Conversation:
156
157
  else:
157
158
  ret += role + ":"
158
159
  return ret
160
+ elif self.sep_style == SeparatorStyle.LLAMA4:
161
+ # begin_of_text is added by default
162
+ if self.system_message:
163
+ ret = system_prompt
164
+ else:
165
+ ret = ""
166
+ for i, (role, message) in enumerate(self.messages):
167
+ if message:
168
+ ret += f"<|header_start|>{role}<|header_end|>\n\n"
169
+ ret += f"{message.strip()}<|eot|>"
170
+ else:
171
+ ret += f"<|header_start|>{role}<|header_end|>\n\n"
172
+ return ret
159
173
  elif self.sep_style == SeparatorStyle.LLAMA3:
160
- ret = "<|begin_of_text|>"
161
174
  if self.system_message:
162
- ret += system_prompt
175
+ ret = system_prompt
163
176
  else:
164
- ret += ""
177
+ ret = ""
165
178
  for i, (role, message) in enumerate(self.messages):
166
179
  if message:
167
180
  ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
168
181
  ret += f"{message.strip()}<|eot_id|>"
169
182
  else:
170
183
  ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
171
- # print(ret)
172
184
  return ret
173
185
  elif self.sep_style == SeparatorStyle.LLAMA2:
174
186
  seps = [self.sep, self.sep2]
@@ -561,6 +573,19 @@ register_conv_template(
561
573
  )
562
574
  )
563
575
 
576
+ # reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
577
+ register_conv_template(
578
+ Conversation(
579
+ name="llama-4",
580
+ system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
581
+ roles=("user", "assistant"),
582
+ sep_style=SeparatorStyle.LLAMA4,
583
+ sep="",
584
+ stop_str=["<|end_of_text|>", "<|eot|>", "<|eom|>"],
585
+ image_token="<|image|>",
586
+ )
587
+ )
588
+
564
589
  register_conv_template(
565
590
  Conversation(
566
591
  name="chatml",
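
The SeparatorStyle.LLAMA4 branch added above closes every finished turn with <|eot|> and leaves an open header for a pending turn. The standalone loop below mirrors that branch with made-up messages; it is not the Conversation class itself.

# Standalone mirror of the LLAMA4 rendering loop above; a pending turn
# (message=None) leaves an open header so the model continues from there.
system_prompt = "<|header_start|>system<|header_end|>\n\nBe concise.<|eot|>"
messages = [("user", "What is SGLang?"), ("assistant", None)]

ret = system_prompt
for role, message in messages:
    if message:
        ret += f"<|header_start|>{role}<|header_end|>\n\n"
        ret += f"{message.strip()}<|eot|>"
    else:
        ret += f"<|header_start|>{role}<|header_end|>\n\n"

print(ret)
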
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import numpy as np
4
+
3
5
  from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
4
6
 
5
7
  """
@@ -45,6 +47,206 @@ class FlashAttentionMetadata:
45
47
  # Sequence lengths for the forward batch
46
48
  cache_seqlens_int32: torch.Tensor = None
47
49
 
50
+ @dataclass
51
+ class LocalAttentionMetadata:
52
+ local_query_start_loc: torch.Tensor = None # cu_seqlens_q for local attention
53
+ local_seqused_k: torch.Tensor = None # sequence lengths for local attention
54
+ local_block_table: torch.Tensor = None # block table for local attention
55
+ local_max_query_len: int = 0 # max query length for local attention
56
+ local_max_seq_len: int = 0 # max sequence length for local attention
57
+
58
+ local_attn_metadata: Optional[LocalAttentionMetadata] = None
59
+
60
+
61
+ # Copied from:
62
+ # https://github.com/houseroad/vllm/blob/4e45bfcaf928bdb9bd952b4ac922a3c205589ae8/vllm/v1/attention/backends/flash_attn.py
63
+ #
64
+ # Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into
65
+ # local attention blocks, where each block is passed to the attention kernel
66
+ # as an independent local ("virtual") batch item.
67
+ #
68
+ # For example, if we are performing a chunked prefill with a batch of 3 sequences:
69
+ # q_seqlens = [4, 10, 5]
70
+ # kv_seqlens = [6, 17, 9]
71
+ # Then normally for regular attention we would compute with an attention mask
72
+ # for batch idx 0 (q_seqlens = 4, kv_seqlens = 6) like:
73
+ # batch idx: 0 (q_seqlens = 4, kv_seqlens = 6)
74
+ # k_toks > 0 1 2 3 4 5
75
+ # q_toks v _____________
76
+ # 0 | 1 1 1
77
+ # 1 | 1 1 1 1
78
+ # 2 | 1 1 1 1 1
79
+ # 3 | 1 1 1 1 1 1
80
+ #
81
+ # for local attention (with attn_chunk_size = 4) we would compute with an
82
+ # attention mask like:
83
+ # batch idx: 0 (q_seqlens = 4, kv_seqlens = 6, attn_chunk_size = 4)
84
+ # k_toks > 0 1 2 3 4 5
85
+ # q_toks v _____________
86
+ # 0 | 1 1 1
87
+ # 1 | 1 1 1 1
88
+ # 2 | 1
89
+ # 3 | 1 1
90
+ #
91
+ # We can simulate this mask using standard flash-attention by breaking the
92
+ # sequences into local ("virtual") batches, where each local batch item is a
93
+ # local attention block, so in this case batch idx 0 would be broken up into:
94
+ #
95
+ # local-batch idx: 0 (q_seqlens = 2, kv_seqlens = 4) (batch 0)
96
+ # k_toks > 0 1 2 3
97
+ # q_toks v _____________
98
+ # 0 | 1 1 1
99
+ # 1 | 1 1 1 1
100
+ # local-batch idx: 1 (q_seqlens = 2, kv_seqlens = 2) (batch 0)
101
+ # k_toks > 4 5
102
+ # q_toks v _____________
103
+ # 2 | 1
104
+ # 3 | 1 1
105
+ #
106
+ # e.g. if we have:
107
+ # attn_chunk_size = 4
108
+ # query_start_loc_np = [0, 4, 14, 19] (q_seqlens = [4, 10, 5])
109
+ # Then this function would return:
110
+ # __b0__ ______b1______ __b2__ < orig batch indices
111
+ # q_seqlens_local = [ 2, 2, 1, 4, 4, 1, 4, 1]
112
+ # cu_seqlens_q_local = [0, 2, 4, 5, 9, 13, 14, 18, 19]
113
+ # seqlens_k_local = [ 4, 2, 4, 4, 4, 1, 4, 1]
114
+ # block_table_local : shape[local_virtual_batches, pages_per_local_batch]
115
+ def make_local_attention_virtual_batches(
116
+ attn_chunk_size: int,
117
+ query_start_loc_np: np.ndarray,
118
+ seq_lens_np: np.ndarray,
119
+ block_table: torch.Tensor,
120
+ page_size: int = 0,
121
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray, torch.Tensor]:
122
+ """
123
+ Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into
124
+ local attention blocks, where each block is passed to the attention kernel
125
+ as an independent local ("virtual") batch item.
126
+
127
+ Args:
128
+ attn_chunk_size: Size of local attention chunks
129
+ query_start_loc_np: Cumulative sum of query lengths (numpy array)
130
+ seq_lens_np: Sequence lengths (numpy array)
131
+ block_table: Block table for KV cache
132
+ page_size: Size of each page in the KV cache
133
+
134
+ Returns:
135
+ seqlens_q_local: Query sequence lengths for local attention
136
+ cu_seqlens_q_local: Cumulative sum of query sequence lengths for local attention
137
+ seqlens_k_local: Key sequence lengths for local attention
138
+ block_table_local: Block table for local attention
139
+ """
140
+ q_seqlens = query_start_loc_np[1:] - query_start_loc_np[:-1]
141
+ actual_batch_size = seq_lens_np.shape[0]
142
+
143
+ # Handle if we are starting in the middle of a local attention block,
144
+ # we assume q_seqlens > 0 (for all elements), for each batch idx we compute
145
+ # the number of tokens that are not in the first local attention block and
146
+ # then we can simply use a cdiv for the rest.
147
+ # For example if we have:
148
+ # attn_chunk_size = 4
149
+ # q_seqlens = [4, 10, 5]
150
+ # k_seqlens = [6, 17, 9]
151
+ # Then we would get:
152
+ # new_tokens_in_first_block = [2, 1, 4]
153
+ # local_blocks = [2, 4, 2]
154
+ q_tokens_in_first_block = np.minimum(
155
+ attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size), q_seqlens
156
+ ).astype(np.int32)
157
+ tokens_in_last_block = attn_chunk_size + (seq_lens_np % -attn_chunk_size)
158
+ local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block, attn_chunk_size)
159
+
160
+ # Once we know the number of local blocks we can compute the request spans
161
+ # for each batch idx, we can figure out the number of "virtual" requests we
162
+ # have to make,
163
+ # For the above example we would get:
164
+ # seqlens_q_local = [2, 2, 1, 4, 4, 1, 4, 1]
165
+ #
166
+ # First Get batched arange. (E.g., [2, 4, 2] -> [0, 1, 0, 1, 2, 3, 0, 1])
167
+ # (TODO: make a utility to share this code with _prepare_inputs)
168
+ # arange step 1. [2, 4, 2] -> [2, 6, 8]
169
+ cu_num_blocks = np.cumsum(local_blocks)
170
+ virtual_batches = cu_num_blocks[-1]
171
+ # arange step 2. [2, 6, 8] -> [0, 0, 2, 2, 2, 2, 6, 6]
172
+ block_offsets = np.repeat(cu_num_blocks - local_blocks, local_blocks)
173
+ # arange step 3. [0, 1, 0, 1, 2, 3, 0, 1]
174
+ arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
175
+ # also compute reverse arange (i.e. [1, 0, 3, 2, 1, 0, 1, 0])
176
+ rarange = np.repeat(local_blocks, local_blocks) - arange - 1
177
+ # Then we can compute the seqlens_q_local, handling the fact that the
178
+ # first and last blocks could be partial
179
+ seqlens_q_local = np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks)
180
+ # set the first block since this may be a partial block
181
+ seqlens_q_local[arange == 0] = q_tokens_in_first_block
182
+ # set the remaining blocks
183
+ seqlens_q_local[arange > 0] = np.minimum(
184
+ seqlens_q_local - attn_chunk_size * (arange - 1), attn_chunk_size
185
+ )[arange > 0]
186
+
187
+ # convert from q_seqlens to cu_seqlens_q
188
+ cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0)).astype(np.int32)
189
+
190
+ # compute the seqlens_k_local,
191
+ # basically a full local attention block for all but the last block in each
192
+ # batch
193
+ # For our example this will be:
194
+ # seqlens_k_local = [4, 2, 4, 4, 4, 1, 4, 1]
195
+ seqlens_k_local = np.full(cu_num_blocks[-1], attn_chunk_size, dtype=np.int32)
196
+ seqlens_k_local[cu_num_blocks - 1] = tokens_in_last_block
197
+
198
+ k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - (
199
+ rarange * attn_chunk_size + np.repeat(tokens_in_last_block, local_blocks)
200
+ )
201
+ # For the example the local attention blocks start at:
202
+ # _b0_ _____b1_____ _b2_
203
+ # k_seqstarts_absolute = [0, 4, 4, 8, 12, 16, 4, 8]
204
+ block_starts = k_seqstarts_absolute // page_size
205
+
206
+ assert attn_chunk_size % page_size == 0, (
207
+ f"attn_chunk_size {attn_chunk_size} is not "
208
+ f"divisible by page_size {page_size}"
209
+ )
210
+ pages_per_local_batch = attn_chunk_size // page_size
211
+
212
+ # Create a block_table for the local attention blocks
213
+ # For our example if we have a block-table like (assuming page_size=2):
214
+ # block_table = [
215
+ # [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], < batch 0
216
+ # [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], < batch 1
217
+ # [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], < batch 2
218
+ # ]
219
+ # Then for the local batches we would want a block-table like
220
+ # block_table_local = [
221
+ # [ 0, 1 ], < local-batch 0, (batch 0, starting from k[0])
222
+ # [ 2, 3 ], < local-batch 1, (batch 0, starting from k[4])
223
+ # [ 12, 13 ], < local-batch 2, (batch 1, starting from k[4])
224
+ # [ 14, 15 ], < local-batch 3, (batch 1, starting from k[8])
225
+ # [ 16, 17 ], < local-batch 4, (batch 1, starting from k[12])
226
+ # [ 18, 19 ], < local-batch 5, (batch 1, starting from k[16])
227
+ # [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4])
228
+ # [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8])
229
+ # ]
230
+ block_indices = np.broadcast_to(
231
+ np.arange(pages_per_local_batch, dtype=np.int32),
232
+ (virtual_batches, pages_per_local_batch),
233
+ ) + np.expand_dims(block_starts, axis=1)
234
+ block_indices = block_indices.flatten()
235
+ batch_indices = np.repeat(
236
+ np.arange(actual_batch_size, dtype=np.int32),
237
+ local_blocks * pages_per_local_batch,
238
+ )
239
+ block_table_local = block_table[batch_indices, block_indices].view(
240
+ virtual_batches, -1
241
+ )
242
+
243
+ return seqlens_q_local, cu_seqlens_q_local, seqlens_k_local, block_table_local
244
+
245
+
246
+ def cdiv(a: int, b: int) -> int:
247
+ """Ceiling division."""
248
+ return -(a // -b)
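
To make the worked example above concrete, here is a self-contained NumPy sketch (not part of the diff) that re-derives the per-virtual-batch query/key lengths for attn_chunk_size = 4, q_seqlens = [4, 10, 5] and kv_seqlens = [6, 17, 9]; the block-table remapping is omitted.

import numpy as np

# Re-derive the worked example: only the local query/key lengths, no block table.
chunk = 4
q_seqlens = np.array([4, 10, 5])
k_seqlens = np.array([6, 17, 9])

def cdiv(a, b):
    return -(a // -b)

q_first = np.minimum(chunk - ((k_seqlens - q_seqlens) % chunk), q_seqlens)
k_last = chunk + (k_seqlens % -chunk)
blocks = 1 + cdiv(q_seqlens - q_first, chunk)              # [2, 4, 2] virtual batches

arange = np.concatenate([np.arange(b) for b in blocks])    # [0,1, 0,1,2,3, 0,1]
q_local = np.repeat(q_seqlens - q_first, blocks)
q_local[arange == 0] = q_first                             # partial first blocks
q_local[arange > 0] = np.minimum(q_local - chunk * (arange - 1), chunk)[arange > 0]

k_local = np.full(blocks.sum(), chunk)
k_local[np.cumsum(blocks) - 1] = k_last                    # partial last block per batch

print(q_local.tolist())                                    # [2, 2, 1, 4, 4, 1, 4, 1]
print(np.pad(np.cumsum(q_local), (1, 0)).tolist())         # [0, 2, 4, 5, 9, 13, 14, 18, 19]
print(k_local.tolist())                                    # [4, 2, 4, 4, 4, 1, 4, 1]
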
249
+
48
250
 
49
251
  class FlashAttentionBackend(AttentionBackend):
50
252
  """FlashAttention backend implementation.
@@ -100,6 +302,13 @@ class FlashAttentionBackend(AttentionBackend):
100
302
  self.step_id = step_id
101
303
  self.speculative_num_steps = speculative_num_steps
102
304
 
305
+ # Local attention settings
306
+ self.attention_chunk_size = (
307
+ model_runner.attention_chunk_size
308
+ if hasattr(model_runner, "attention_chunk_size")
309
+ else None
310
+ )
311
+
103
312
  def init_forward_metadata(self, forward_batch: ForwardBatch):
104
313
  """Initialize forward metadata to cache repetitive calculations."""
105
314
  metadata = FlashAttentionMetadata()
@@ -189,6 +398,7 @@ class FlashAttentionBackend(AttentionBackend):
189
398
  metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
190
399
  forward_batch.req_pool_indices, : metadata.max_seq_len_k
191
400
  ]
401
+
192
402
  # Precompute cumulative sequence lengths
193
403
  if (
194
404
  any(forward_batch.extend_prefix_lens_cpu)
@@ -203,6 +413,51 @@ class FlashAttentionBackend(AttentionBackend):
203
413
  metadata.cu_seqlens_q = metadata.cu_seqlens_k
204
414
  metadata.max_seq_len_q = metadata.max_seq_len_k
205
415
 
416
+ # Setup local attention if enabled
417
+ if (
418
+ self.attention_chunk_size is not None
419
+ and forward_batch.forward_mode == ForwardMode.EXTEND
420
+ ):
421
+ # Convert tensors to numpy for local attention processing
422
+ cu_seqlens_q_np = metadata.cu_seqlens_q.cpu().numpy()
423
+ seq_lens_np = metadata.cache_seqlens_int32.cpu().numpy()
424
+
425
+ # Adjust attention_chunk_size based on the actual sequence length
426
+ # to avoid index out of bounds errors
427
+ max_seq_len = seq_lens_np.max()
428
+ effective_chunk_size = min(self.attention_chunk_size, max_seq_len)
429
+ # Make sure effective_chunk_size is divisible by page_size
430
+ effective_chunk_size = (
431
+ effective_chunk_size // self.page_size
432
+ ) * self.page_size
433
+ if effective_chunk_size < self.page_size:
434
+ effective_chunk_size = self.page_size
435
+
436
+ # Create local attention metadata
437
+ (
438
+ seqlens_q_local_np,
439
+ cu_seqlens_q_local_np,
440
+ seqlens_k_local_np,
441
+ block_table_local,
442
+ ) = make_local_attention_virtual_batches(
443
+ effective_chunk_size,
444
+ cu_seqlens_q_np,
445
+ seq_lens_np,
446
+ metadata.page_table,
447
+ self.page_size,
448
+ )
449
+
450
+ local_metadata = FlashAttentionMetadata.LocalAttentionMetadata(
451
+ local_query_start_loc=torch.from_numpy(cu_seqlens_q_local_np).to(
452
+ device
453
+ ),
454
+ local_seqused_k=torch.from_numpy(seqlens_k_local_np).to(device),
455
+ local_block_table=block_table_local,
456
+ local_max_query_len=seqlens_q_local_np.max(),
457
+ local_max_seq_len=seqlens_k_local_np.max(),
458
+ )
459
+ metadata.local_attn_metadata = local_metadata
460
+
206
461
  # Precompute strided indices
207
462
  if self.page_size > 1:
208
463
  self.strided_indices = torch.arange(
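
The chunk-size adjustment in the hunk above caps the local-attention chunk at the longest sequence in the batch and keeps it a multiple of page_size. A standalone restatement of that clamping (the helper name is illustrative, not part of the diff):

def clamp_chunk_size(attention_chunk_size: int, max_seq_len: int, page_size: int) -> int:
    # Cap at the longest sequence, round down to a page multiple, floor at one page.
    size = min(attention_chunk_size, max_seq_len)
    size = (size // page_size) * page_size
    return max(size, page_size)

assert clamp_chunk_size(8192, 300, 16) == 288   # rounded down to a page multiple
assert clamp_chunk_size(8192, 10, 16) == 16     # never smaller than one page
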
@@ -211,6 +466,7 @@ class FlashAttentionBackend(AttentionBackend):
211
466
  metadata.page_table = (
212
467
  metadata.page_table[:, self.strided_indices] // self.page_size
213
468
  )
469
+
214
470
  self.forward_metadata = metadata
215
471
 
216
472
  def forward_extend(
@@ -254,7 +510,28 @@ class FlashAttentionBackend(AttentionBackend):
254
510
  else (-1, -1)
255
511
  )
256
512
 
257
- page_table = metadata.page_table
513
+ # Check if we should use local attention
514
+ use_local_attn = (
515
+ self.attention_chunk_size is not None
516
+ and metadata.local_attn_metadata is not None
517
+ and (hasattr(layer, "use_irope") and layer.use_irope)
518
+ )
519
+
520
+ # Get the appropriate page table based on whether we're using local attention
521
+ if use_local_attn:
522
+ local_metadata = metadata.local_attn_metadata
523
+ page_table = local_metadata.local_block_table
524
+ cu_seqlens_q = local_metadata.local_query_start_loc
525
+ cache_seqlens = local_metadata.local_seqused_k
526
+ max_seqlen_q = local_metadata.local_max_query_len
527
+ max_seqlen_k = local_metadata.local_max_seq_len
528
+ else:
529
+ page_table = metadata.page_table
530
+ cu_seqlens_q = metadata.cu_seqlens_q
531
+ cache_seqlens = metadata.cache_seqlens_int32
532
+ max_seqlen_q = metadata.max_seq_len_q
533
+ max_seqlen_k = metadata.max_seq_len_k
534
+ cu_seqlens_k = metadata.cu_seqlens_k
258
535
 
259
536
  # Use Flash Attention for prefill
260
537
  if not self.use_mla:
@@ -272,10 +549,10 @@ class FlashAttentionBackend(AttentionBackend):
272
549
  k_cache=key_cache,
273
550
  v_cache=value_cache,
274
551
  page_table=page_table,
275
- cache_seqlens=metadata.cache_seqlens_int32,
276
- cu_seqlens_q=metadata.cu_seqlens_q,
277
- cu_seqlens_k_new=metadata.cu_seqlens_k,
278
- max_seqlen_q=metadata.max_seq_len_q,
552
+ cache_seqlens=cache_seqlens,
553
+ cu_seqlens_q=cu_seqlens_q,
554
+ cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
555
+ max_seqlen_q=max_seqlen_q,
279
556
  softmax_scale=layer.scaling,
280
557
  causal=True,
281
558
  window_size=window_size,
@@ -307,10 +584,10 @@ class FlashAttentionBackend(AttentionBackend):
307
584
  v_cache=c_kv_cache,
308
585
  qv=q_nope,
309
586
  page_table=page_table,
310
- cache_seqlens=metadata.cache_seqlens_int32,
311
- cu_seqlens_q=metadata.cu_seqlens_q,
312
- cu_seqlens_k_new=metadata.cu_seqlens_k,
313
- max_seqlen_q=metadata.max_seq_len_q,
587
+ cache_seqlens=cache_seqlens,
588
+ cu_seqlens_q=cu_seqlens_q,
589
+ cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
590
+ max_seqlen_q=max_seqlen_q,
314
591
  softmax_scale=layer.scaling,
315
592
  causal=True,
316
593
  softcap=layer.logit_cap,
@@ -23,9 +23,14 @@ def fused_moe_forward_native(
23
23
  custom_routing_function: Optional[Callable] = None,
24
24
  correction_bias: Optional[torch.Tensor] = None,
25
25
  activation: str = "silu",
26
+ apply_router_weight_on_input: bool = False,
26
27
  inplace: bool = True,
27
28
  no_combine: bool = False,
28
29
  ) -> torch.Tensor:
30
+
31
+ if apply_router_weight_on_input:
32
+ raise NotImplementedError
33
+
29
34
  topk_weights, topk_ids = select_experts(
30
35
  hidden_states=x,
31
36
  router_logits=router_logits,
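
The native path above only adds the apply_router_weight_on_input flag and rejects it. As the name suggests (this is my reading, not something the diff states), the flag controls whether the top-k routing weight multiplies the expert input rather than the expert output; the toy snippet below illustrates that distinction and is not sglang code.

import torch

# Weight the expert input instead of the expert output.
x = torch.randn(2, 8)                    # two tokens, hidden size 8
w = torch.tensor([[0.7], [0.4]])         # per-token routing weight of one expert
expert = torch.nn.Linear(8, 8, bias=False)

out_weighted = w * expert(x)             # weight applied to the output (default)
in_weighted = expert(w * x)              # weight applied to the input (the new flag)

# For a purely linear expert the two agree; for gated MLP experts they do not.
assert torch.allclose(out_weighted, in_weighted, atol=1e-5)
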
@@ -0,0 +1,146 @@
1
+ {
2
+ "1": {
3
+ "BLOCK_SIZE_M": 16,
4
+ "BLOCK_SIZE_N": 64,
5
+ "BLOCK_SIZE_K": 128,
6
+ "GROUP_SIZE_M": 1,
7
+ "num_warps": 4,
8
+ "num_stages": 3
9
+ },
10
+ "2": {
11
+ "BLOCK_SIZE_M": 16,
12
+ "BLOCK_SIZE_N": 64,
13
+ "BLOCK_SIZE_K": 64,
14
+ "GROUP_SIZE_M": 1,
15
+ "num_warps": 4,
16
+ "num_stages": 5
17
+ },
18
+ "4": {
19
+ "BLOCK_SIZE_M": 16,
20
+ "BLOCK_SIZE_N": 64,
21
+ "BLOCK_SIZE_K": 64,
22
+ "GROUP_SIZE_M": 1,
23
+ "num_warps": 4,
24
+ "num_stages": 5
25
+ },
26
+ "8": {
27
+ "BLOCK_SIZE_M": 16,
28
+ "BLOCK_SIZE_N": 64,
29
+ "BLOCK_SIZE_K": 128,
30
+ "GROUP_SIZE_M": 1,
31
+ "num_warps": 4,
32
+ "num_stages": 3
33
+ },
34
+ "16": {
35
+ "BLOCK_SIZE_M": 16,
36
+ "BLOCK_SIZE_N": 64,
37
+ "BLOCK_SIZE_K": 128,
38
+ "GROUP_SIZE_M": 16,
39
+ "num_warps": 4,
40
+ "num_stages": 3
41
+ },
42
+ "24": {
43
+ "BLOCK_SIZE_M": 16,
44
+ "BLOCK_SIZE_N": 64,
45
+ "BLOCK_SIZE_K": 64,
46
+ "GROUP_SIZE_M": 1,
47
+ "num_warps": 4,
48
+ "num_stages": 4
49
+ },
50
+ "32": {
51
+ "BLOCK_SIZE_M": 16,
52
+ "BLOCK_SIZE_N": 64,
53
+ "BLOCK_SIZE_K": 128,
54
+ "GROUP_SIZE_M": 1,
55
+ "num_warps": 4,
56
+ "num_stages": 2
57
+ },
58
+ "48": {
59
+ "BLOCK_SIZE_M": 32,
60
+ "BLOCK_SIZE_N": 128,
61
+ "BLOCK_SIZE_K": 128,
62
+ "GROUP_SIZE_M": 16,
63
+ "num_warps": 4,
64
+ "num_stages": 2
65
+ },
66
+ "64": {
67
+ "BLOCK_SIZE_M": 16,
68
+ "BLOCK_SIZE_N": 128,
69
+ "BLOCK_SIZE_K": 256,
70
+ "GROUP_SIZE_M": 1,
71
+ "num_warps": 4,
72
+ "num_stages": 2
73
+ },
74
+ "96": {
75
+ "BLOCK_SIZE_M": 32,
76
+ "BLOCK_SIZE_N": 128,
77
+ "BLOCK_SIZE_K": 128,
78
+ "GROUP_SIZE_M": 16,
79
+ "num_warps": 8,
80
+ "num_stages": 3
81
+ },
82
+ "128": {
83
+ "BLOCK_SIZE_M": 32,
84
+ "BLOCK_SIZE_N": 64,
85
+ "BLOCK_SIZE_K": 128,
86
+ "GROUP_SIZE_M": 16,
87
+ "num_warps": 4,
88
+ "num_stages": 5
89
+ },
90
+ "256": {
91
+ "BLOCK_SIZE_M": 16,
92
+ "BLOCK_SIZE_N": 128,
93
+ "BLOCK_SIZE_K": 256,
94
+ "GROUP_SIZE_M": 1,
95
+ "num_warps": 4,
96
+ "num_stages": 3
97
+ },
98
+ "512": {
99
+ "BLOCK_SIZE_M": 16,
100
+ "BLOCK_SIZE_N": 128,
101
+ "BLOCK_SIZE_K": 128,
102
+ "GROUP_SIZE_M": 16,
103
+ "num_warps": 8,
104
+ "num_stages": 3
105
+ },
106
+ "1024": {
107
+ "BLOCK_SIZE_M": 16,
108
+ "BLOCK_SIZE_N": 128,
109
+ "BLOCK_SIZE_K": 256,
110
+ "GROUP_SIZE_M": 1,
111
+ "num_warps": 4,
112
+ "num_stages": 2
113
+ },
114
+ "1536": {
115
+ "BLOCK_SIZE_M": 32,
116
+ "BLOCK_SIZE_N": 128,
117
+ "BLOCK_SIZE_K": 128,
118
+ "GROUP_SIZE_M": 1,
119
+ "num_warps": 4,
120
+ "num_stages": 3
121
+ },
122
+ "2048": {
123
+ "BLOCK_SIZE_M": 32,
124
+ "BLOCK_SIZE_N": 128,
125
+ "BLOCK_SIZE_K": 128,
126
+ "GROUP_SIZE_M": 1,
127
+ "num_warps": 4,
128
+ "num_stages": 3
129
+ },
130
+ "3072": {
131
+ "BLOCK_SIZE_M": 64,
132
+ "BLOCK_SIZE_N": 64,
133
+ "BLOCK_SIZE_K": 64,
134
+ "GROUP_SIZE_M": 1,
135
+ "num_warps": 4,
136
+ "num_stages": 3
137
+ },
138
+ "4096": {
139
+ "BLOCK_SIZE_M": 64,
140
+ "BLOCK_SIZE_N": 128,
141
+ "BLOCK_SIZE_K": 128,
142
+ "GROUP_SIZE_M": 1,
143
+ "num_warps": 4,
144
+ "num_stages": 2
145
+ }
146
+ }
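
Tuning files like the JSON above map a candidate token count to Triton tile parameters (block sizes, warps, stages). The fused-MoE Triton path typically picks the entry whose key is closest to the actual number of tokens; a hedged sketch of that lookup, with an illustrative function name rather than the sglang API:

import json

def pick_moe_config(config_text: str, num_tokens: int) -> dict:
    # Keys are candidate token counts; choose the one nearest the real batch size.
    configs = {int(k): v for k, v in json.loads(config_text).items()}
    nearest = min(configs, key=lambda m: abs(m - num_tokens))
    return configs[nearest]

example = '{"1": {"BLOCK_SIZE_M": 16}, "64": {"BLOCK_SIZE_M": 16}, "4096": {"BLOCK_SIZE_M": 64}}'
print(pick_moe_config(example, 100))     # falls back to the "64" entry
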