sglang 0.4.5.post3__tar.gz → 0.4.6.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (631)
  1. {sglang-0.4.5.post3/sglang.egg-info → sglang-0.4.6.post1}/PKG-INFO +5 -6
  2. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/README.md +2 -2
  3. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/pyproject.toml +3 -4
  4. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_one_batch.py +19 -3
  5. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_serving.py +8 -9
  6. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/compile_deep_gemm.py +45 -4
  7. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/code_completion_parser.py +1 -1
  8. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/deepseekvl2.py +1 -1
  9. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/model_config.py +9 -3
  10. sglang-0.4.6.post1/sglang/srt/constrained/llguidance_backend.py +169 -0
  11. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/conversation.py +34 -1
  12. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/decode.py +67 -13
  13. sglang-0.4.6.post1/sglang/srt/disaggregation/fake/__init__.py +1 -0
  14. sglang-0.4.6.post1/sglang/srt/disaggregation/fake/conn.py +88 -0
  15. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mini_lb.py +45 -8
  16. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/conn.py +198 -31
  17. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/prefill.py +36 -12
  18. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/utils.py +16 -2
  19. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/engine.py +9 -0
  20. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/http_server.py +35 -4
  21. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/function_call_parser.py +77 -5
  22. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/base_attn_backend.py +3 -0
  23. sglang-0.4.6.post1/sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  24. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashattention_backend.py +28 -10
  25. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashmla_backend.py +8 -11
  26. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/utils.py +1 -1
  27. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/vision.py +2 -0
  28. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/layernorm.py +38 -16
  29. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/logits_processor.py +2 -2
  30. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_native.py +2 -4
  31. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  32. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  33. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  34. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  36. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  38. sglang-0.4.5.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +35 -35
  39. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  40. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  42. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  43. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
  46. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  47. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/pooler.py +6 -0
  48. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/awq.py +5 -1
  49. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/deep_gemm.py +17 -10
  50. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8.py +20 -22
  51. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8_utils.py +2 -2
  52. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/int8_kernel.py +32 -1
  53. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/radix_attention.py +13 -3
  54. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/rotary_embedding.py +170 -126
  55. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/data_parallel_controller.py +10 -3
  56. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/io_struct.py +7 -0
  57. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/mm_utils.py +85 -28
  58. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  59. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  60. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  61. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  62. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  63. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  64. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/schedule_batch.py +38 -12
  65. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/scheduler.py +41 -28
  66. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
  67. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/tokenizer_manager.py +5 -1
  68. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/tp_worker.py +3 -3
  69. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
  70. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/memory_pool.py +87 -0
  71. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_executor/cuda_graph_runner.py +4 -3
  72. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_executor/forward_batch_info.py +51 -95
  73. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_executor/model_runner.py +19 -25
  74. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek.py +12 -2
  75. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_nextn.py +101 -6
  76. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_v2.py +144 -70
  77. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_vl2.py +9 -4
  78. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma3_causal.py +1 -1
  79. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama4.py +0 -1
  80. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpmo.py +5 -1
  81. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mllama4.py +2 -2
  82. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_5_vl.py +3 -6
  83. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_vl.py +3 -7
  84. sglang-0.4.6.post1/sglang/srt/models/roberta.py +178 -0
  85. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/openai_api/adapter.py +50 -11
  86. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/openai_api/protocol.py +2 -0
  87. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/reasoning_parser.py +25 -1
  88. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/server_args.py +31 -24
  89. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  90. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/torch_memory_saver_adapter.py +10 -1
  91. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/utils.py +5 -1
  92. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/runners.py +6 -13
  93. sglang-0.4.6.post1/sglang/test/send_one.py +144 -0
  94. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_utils.py +74 -18
  95. sglang-0.4.6.post1/sglang/version.py +1 -0
  96. {sglang-0.4.5.post3 → sglang-0.4.6.post1/sglang.egg-info}/PKG-INFO +5 -6
  97. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/SOURCES.txt +17 -0
  98. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/requires.txt +2 -2
  99. sglang-0.4.5.post3/sglang/srt/constrained/llguidance_backend.py +0 -152
  100. sglang-0.4.5.post3/sglang/test/send_one.py +0 -88
  101. sglang-0.4.5.post3/sglang/version.py +0 -1
  102. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/LICENSE +0 -0
  103. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/setup.cfg +0 -0
  104. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/__init__.py +0 -0
  105. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/api.py +0 -0
  106. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_offline_throughput.py +0 -0
  107. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_one_batch_server.py +0 -0
  108. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/check_env.py +0 -0
  109. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/global_config.py +0 -0
  110. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/__init__.py +0 -0
  111. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/anthropic.py +0 -0
  112. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/base_backend.py +0 -0
  113. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/litellm.py +0 -0
  114. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/openai.py +0 -0
  115. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  116. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/backend/vertexai.py +0 -0
  117. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/chat_template.py +0 -0
  118. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/choices.py +0 -0
  119. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/compiler.py +0 -0
  120. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/interpreter.py +0 -0
  121. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/ir.py +0 -0
  122. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/lang/tracer.py +0 -0
  123. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/launch_server.py +0 -0
  124. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/llama3_eval.py +0 -0
  125. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/_custom_ops.py +0 -0
  126. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/aio_rwlock.py +0 -0
  127. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/__init__.py +0 -0
  128. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/chatglm.py +0 -0
  129. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/dbrx.py +0 -0
  130. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/device_config.py +0 -0
  131. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/exaone.py +0 -0
  132. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/janus_pro.py +0 -0
  133. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/load_config.py +0 -0
  134. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/utils.py +0 -0
  135. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/__init__.py +0 -0
  136. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/base_connector.py +0 -0
  137. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/redis.py +0 -0
  138. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/s3.py +0 -0
  139. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/serde/__init__.py +0 -0
  140. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
  141. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/serde/serde.py +0 -0
  142. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/connector/utils.py +0 -0
  143. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  144. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
  145. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  146. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  147. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  148. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  149. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/custom_op.py +0 -0
  150. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/base/__init__.py +0 -0
  151. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/base/conn.py +0 -0
  152. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  153. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
  154. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  155. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/disaggregation/nixl/conn.py +0 -0
  156. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/__init__.py +0 -0
  157. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/communication_op.py +0 -0
  158. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  159. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  160. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  161. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  162. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  163. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  164. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  165. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  166. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  167. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/distributed/utils.py +0 -0
  168. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/EngineBase.py +0 -0
  169. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/http_server_engine.py +0 -0
  170. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/entrypoints/verl_engine.py +0 -0
  171. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  172. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/activation.py +0 -0
  173. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  174. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  175. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  176. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  177. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  178. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  179. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  180. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  181. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  182. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  183. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/dp_attention.py +0 -0
  184. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/elementwise.py +0 -0
  185. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/linear.py +0 -0
  186. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  187. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  188. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  189. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
  190. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  191. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  192. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  193. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  194. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  195. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  196. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  197. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  198. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  199. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  200. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  201. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  202. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  203. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  204. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  205. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  206. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  207. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  208. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  209. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  210. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  211. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  212. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  213. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  214. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  215. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  216. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  217. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  218. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  219. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  220. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  221. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  222. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  223. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  224. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  225. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  226. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  227. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  228. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  229. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  231. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  233. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  234. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  235. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  236. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  237. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  238. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  239. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  240. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  241. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  243. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  244. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  245. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  246. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  247. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  249. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  250. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  251. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  252. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  253. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  254. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  255. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  256. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  257. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  258. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  259. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  260. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  261. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  262. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  263. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  264. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  265. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  266. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  267. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  268. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  269. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  270. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  271. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  272. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  273. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  274. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  275. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  276. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  277. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  278. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  279. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  280. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  281. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  282. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  283. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  284. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  285. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  286. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  287. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  288. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  289. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  290. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  291. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  292. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  293. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  294. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  295. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  296. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  297. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  298. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  299. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  300. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  301. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  302. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  303. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  304. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  305. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  306. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  307. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  308. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  309. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  310. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  311. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  312. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  313. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  314. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  315. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  316. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  317. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  318. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  319. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  320. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  321. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  322. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  323. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  324. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/router.py +0 -0
  325. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/moe/topk.py +0 -0
  326. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/parameter.py +0 -0
  327. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  328. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  329. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  330. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  331. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  332. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  333. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  334. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  335. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
  336. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  337. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  338. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  339. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  340. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  341. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  342. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  343. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  344. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  345. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  346. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  347. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  348. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  491. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
  492. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  493. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  494. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  495. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
  496. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/utils.py +0 -0
  497. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  498. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  499. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/sampler.py +0 -0
  500. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  501. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  502. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
  503. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  504. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
  505. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/layers.py +0 -0
  506. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/lora.py +0 -0
  507. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/lora_config.py +0 -0
  508. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/lora_manager.py +0 -0
  509. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/mem_pool.py +0 -0
  510. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  511. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  512. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  513. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  514. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  515. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/lora/utils.py +0 -0
  516. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/cache_controller.py +0 -0
  517. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/configure_logging.py +0 -0
  518. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  519. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/expert_distribution.py +0 -0
  520. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
  521. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  522. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  523. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  524. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/mllama4.py +0 -0
  525. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  526. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/session_controller.py +0 -0
  527. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/managers/utils.py +0 -0
  528. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  529. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  530. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  531. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  532. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  533. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  534. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/metrics/collector.py +0 -0
  535. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/metrics/func_timer.py +0 -0
  536. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/mm_utils.py +0 -0
  537. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/__init__.py +0 -0
  538. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/loader.py +0 -0
  539. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/utils.py +0 -0
  540. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
  541. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/model_parallel.py +0 -0
  542. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/baichuan.py +0 -0
  543. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/bert.py +0 -0
  544. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/chatglm.py +0 -0
  545. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/clip.py +0 -0
  546. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/commandr.py +0 -0
  547. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/dbrx.py +0 -0
  548. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  549. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/exaone.py +0 -0
  550. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma.py +0 -0
  551. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma2.py +0 -0
  552. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  553. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gemma3_mm.py +0 -0
  554. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gpt2.py +0 -0
  555. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  556. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/granite.py +0 -0
  557. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/grok.py +0 -0
  558. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/internlm2.py +0 -0
  559. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  560. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama.py +0 -0
  561. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_classification.py +0 -0
  562. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_eagle.py +0 -0
  563. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_eagle3.py +0 -0
  564. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_embedding.py +0 -0
  565. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llama_reward.py +0 -0
  566. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llava.py +0 -0
  567. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/llavavid.py +0 -0
  568. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpm.py +0 -0
  569. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpm3.py +0 -0
  570. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/minicpmv.py +0 -0
  571. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mistral.py +0 -0
  572. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mixtral.py +0 -0
  573. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  574. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/mllama.py +0 -0
  575. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/olmo.py +0 -0
  576. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/olmo2.py +0 -0
  577. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/olmoe.py +0 -0
  578. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/phi3_small.py +0 -0
  579. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen.py +0 -0
  580. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2.py +0 -0
  581. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_classification.py +0 -0
  582. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  583. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  584. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_rm.py +0 -0
  585. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen3.py +0 -0
  586. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/qwen3_moe.py +0 -0
  587. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/registry.py +0 -0
  588. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/stablelm.py +0 -0
  589. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  590. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/xverse.py +0 -0
  591. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/xverse_moe.py +0 -0
  592. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/models/yivl.py +0 -0
  593. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/patch_torch.py +0 -0
  594. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/platforms/interface.py +0 -0
  595. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  596. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  597. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  598. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  599. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  600. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  601. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  602. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  603. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  604. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_utils.py +0 -0
  605. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_worker.py +0 -0
  606. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/speculative/spec_info.py +0 -0
  607. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/warmup.py +0 -0
  608. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/__init__.py +0 -0
  609. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/__init__.py +0 -0
  610. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/test_flashattn_backend.py +0 -0
  611. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  612. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  613. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  614. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  615. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/run_eval.py +0 -0
  616. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_common.py +0 -0
  617. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  618. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  619. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_math.py +0 -0
  620. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  621. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  622. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_activation.py +0 -0
  623. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_block_fp8.py +0 -0
  624. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_block_fp8_ep.py +0 -0
  625. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_custom_ops.py +0 -0
  626. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
  627. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_layernorm.py +0 -0
  628. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/test/test_programs.py +0 -0
  629. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/utils.py +0 -0
  630. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/dependency_links.txt +0 -0
  631. {sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.5.post3
3
+ Version: 0.4.6.post1
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -225,7 +225,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
225
225
  Requires-Dist: hf_transfer; extra == "runtime-common"
226
226
  Requires-Dist: huggingface_hub; extra == "runtime-common"
227
227
  Requires-Dist: interegular; extra == "runtime-common"
228
- Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
228
+ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
229
229
  Requires-Dist: modelscope; extra == "runtime-common"
230
230
  Requires-Dist: ninja; extra == "runtime-common"
231
231
  Requires-Dist: orjson; extra == "runtime-common"
@@ -242,11 +242,10 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
242
242
  Requires-Dist: transformers==4.51.1; extra == "runtime-common"
243
243
  Requires-Dist: uvicorn; extra == "runtime-common"
244
244
  Requires-Dist: uvloop; extra == "runtime-common"
245
- Requires-Dist: compressed-tensors; extra == "runtime-common"
246
245
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
247
246
  Provides-Extra: srt
248
247
  Requires-Dist: sglang[runtime_common]; extra == "srt"
249
- Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
248
+ Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
250
249
  Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
251
250
  Requires-Dist: torch==2.6.0; extra == "srt"
252
251
  Requires-Dist: torchvision==0.21.0; extra == "srt"
@@ -409,5 +408,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
409
408
 
410
409
  For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
411
410
 
412
- ## Acknowledgment and Citation
413
- We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
411
+ ## Acknowledgment
412
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
71
71
 
72
72
  For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
73
73
 
74
- ## Acknowledgment and Citation
75
- We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
74
+ ## Acknowledgment
75
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.5.post3"
7
+ version = "0.4.6.post1"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -24,7 +24,7 @@ runtime_common = [
24
24
  "hf_transfer",
25
25
  "huggingface_hub",
26
26
  "interegular",
27
- "llguidance>=0.6.15",
27
+ "llguidance>=0.7.11,<0.8.0",
28
28
  "modelscope",
29
29
  "ninja",
30
30
  "orjson",
@@ -41,13 +41,12 @@ runtime_common = [
41
41
  "transformers==4.51.1",
42
42
  "uvicorn",
43
43
  "uvloop",
44
- "compressed-tensors",
45
44
  "xgrammar==0.1.17",
46
45
  ]
47
46
 
48
47
  srt = [
49
48
  "sglang[runtime_common]",
50
- "sgl-kernel==0.0.9.post2",
49
+ "sgl-kernel==0.1.0",
51
50
  "flashinfer_python==0.2.3",
52
51
  "torch==2.6.0",
53
52
  "torchvision==0.21.0",
@@ -57,6 +57,7 @@ import torch
57
57
  import torch.distributed as dist
58
58
 
59
59
  from sglang.srt.configs.model_config import ModelConfig
60
+ from sglang.srt.distributed.parallel_state import destroy_distributed_environment
60
61
  from sglang.srt.entrypoints.engine import _set_envs_and_config
61
62
  from sglang.srt.hf_transformers_utils import get_tokenizer
62
63
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
@@ -85,6 +86,7 @@ class BenchArgs:
85
86
  correctness_test: bool = False
86
87
  # This is only used for correctness test
87
88
  cut_len: int = 4
89
+ log_decode_step: int = 0
88
90
  profile: bool = False
89
91
  profile_filename_prefix: str = "profile"
90
92
 
@@ -105,6 +107,12 @@ class BenchArgs:
105
107
  )
106
108
  parser.add_argument("--correctness-test", action="store_true")
107
109
  parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
110
+ parser.add_argument(
111
+ "--log-decode-step",
112
+ type=int,
113
+ default=BenchArgs.log_decode_step,
114
+ help="Log decode latency by step, default is set to zero to disable.",
115
+ )
108
116
  parser.add_argument(
109
117
  "--profile", action="store_true", help="Use Torch Profiler."
110
118
  )
@@ -335,6 +343,7 @@ def latency_test_run_once(
335
343
  input_len,
336
344
  output_len,
337
345
  device,
346
+ log_decode_step,
338
347
  profile,
339
348
  profile_filename_prefix,
340
349
  ):
@@ -394,9 +403,9 @@ def latency_test_run_once(
394
403
  tot_latency += latency
395
404
  throughput = batch_size / latency
396
405
  decode_latencies.append(latency)
397
- if i < 5:
406
+ if i < 5 or (log_decode_step > 0 and i % log_decode_step == 0):
398
407
  rank_print(
399
- f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
408
+ f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
400
409
  )
401
410
 
402
411
  if profile:
@@ -457,8 +466,9 @@ def latency_test(
457
466
  reqs,
458
467
  bench_args.batch_size[0],
459
468
  bench_args.input_len[0],
460
- 8, # shorter decoding to speed up the warmup
469
+ min(32, bench_args.output_len[0]), # shorter decoding to speed up the warmup
461
470
  server_args.device,
471
+ log_decode_step=0,
462
472
  profile=False,
463
473
  profile_filename_prefix="", # not used
464
474
  )
@@ -480,6 +490,7 @@ def latency_test(
480
490
  il,
481
491
  ol,
482
492
  server_args.device,
493
+ bench_args.log_decode_step,
483
494
  bench_args.profile if tp_rank == 0 else None,
484
495
  bench_args.profile_filename_prefix,
485
496
  )
@@ -492,8 +503,13 @@ def latency_test(
492
503
  for result in result_list:
493
504
  fout.write(json.dumps(result) + "\n")
494
505
 
506
+ if server_args.tp_size > 1:
507
+ destroy_distributed_environment()
508
+
495
509
 
496
510
  def main(server_args, bench_args):
511
+ server_args.cuda_graph_max_bs = max(bench_args.batch_size)
512
+
497
513
  _set_envs_and_config(server_args)
498
514
 
499
515
  if server_args.model_path:
@@ -295,7 +295,7 @@ async def async_request_truss(
295
295
  # NOTE: Some completion API might have a last
296
296
  # usage summary response without a token so we
297
297
  # want to check a token was generated
298
- if data["choices"][0]["delta"]["content"]:
298
+ if data["choices"][0]["text"]:
299
299
  timestamp = time.perf_counter()
300
300
  # First token
301
301
  if ttft == 0.0:
@@ -307,7 +307,7 @@ async def async_request_truss(
307
307
  output.itl.append(timestamp - most_recent_timestamp)
308
308
 
309
309
  most_recent_timestamp = timestamp
310
- generated_text += data["choices"][0]["delta"]["content"]
310
+ generated_text += data["choices"][0]["text"]
311
311
 
312
312
  output.generated_text = generated_text
313
313
  output.success = True
@@ -977,6 +977,7 @@ async def benchmark(
977
977
  profile: bool,
978
978
  pd_seperated: bool = False,
979
979
  flush_cache: bool = False,
980
+ warmup_requests: int = 1,
980
981
  ):
981
982
  if backend in ASYNC_REQUEST_FUNCS:
982
983
  request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -994,11 +995,11 @@ async def benchmark(
994
995
  return await request_func(request_func_input=request_func_input, pbar=pbar)
995
996
 
996
997
  # Warmup
997
- print(f"Starting warmup with {args.warmup_requests} sequences...")
998
+ print(f"Starting warmup with {warmup_requests} sequences...")
998
999
 
999
1000
  # Use the first request for all warmup iterations
1000
1001
  test_prompt, test_prompt_len, test_output_len = input_requests[0]
1001
- if lora_names != None and len(lora_names) != 0:
1002
+ if lora_names is not None and len(lora_names) != 0:
1002
1003
  lora_name = lora_names[0]
1003
1004
  else:
1004
1005
  lora_name = None
@@ -1016,7 +1017,7 @@ async def benchmark(
1016
1017
 
1017
1018
  # Run warmup requests
1018
1019
  warmup_tasks = []
1019
- for _ in range(args.warmup_requests):
1020
+ for _ in range(warmup_requests):
1020
1021
  warmup_tasks.append(
1021
1022
  asyncio.create_task(request_func(request_func_input=test_input))
1022
1023
  )
@@ -1024,9 +1025,7 @@ async def benchmark(
1024
1025
  warmup_outputs = await asyncio.gather(*warmup_tasks)
1025
1026
 
1026
1027
  # Check if at least one warmup request succeeded
1027
- if args.warmup_requests > 0 and not any(
1028
- output.success for output in warmup_outputs
1029
- ):
1028
+ if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
1030
1029
  raise ValueError(
1031
1030
  "Warmup failed - Please make sure benchmark arguments "
1032
1031
  f"are correctly specified. Error: {warmup_outputs[0].error}"
@@ -1058,7 +1057,7 @@ async def benchmark(
1058
1057
  tasks: List[asyncio.Task] = []
1059
1058
  async for request in get_request(input_requests, request_rate):
1060
1059
  prompt, prompt_len, output_len = request
1061
- if lora_names != None and len(lora_names) != 0:
1060
+ if lora_names is not None and len(lora_names) != 0:
1062
1061
  idx = random.randint(0, len(lora_names) - 1)
1063
1062
  lora_name = lora_names[idx]
1064
1063
  else:
@@ -27,7 +27,11 @@ from sglang.srt.warmup import warmup
27
27
  multiprocessing.set_start_method("spawn", force=True)
28
28
 
29
29
  # Reduce warning
30
- os.environ["SGL_IN_DEEP_GEMM_PRE_COMPILE_STAGE"] = "1"
30
+ os.environ["SGL_IN_DEEPGEMM_PRECOMPILE_STAGE"] = "1"
31
+ # Force enable deep gemm
32
+ os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"
33
+ # Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
34
+ os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"
31
35
 
32
36
 
33
37
  @dataclasses.dataclass
@@ -84,8 +88,36 @@ def launch_server_process_and_send_one_request(
84
88
  headers = {
85
89
  "Content-Type": "application/json; charset=utf-8",
86
90
  }
87
- response = requests.get(f"{base_url}/v1/models", headers=headers)
91
+ if server_args.node_rank == 0:
92
+ response = requests.get(f"{base_url}/v1/models", headers=headers)
93
+ else:
94
+ # This http api is created by launch_dummy_health_check_server for none-rank0 node.
95
+ response = requests.get(f"{base_url}/health", headers=headers)
88
96
  if response.status_code == 200:
97
+ # Rank-0 node send a request to sync with other node and then return.
98
+ if server_args.node_rank == 0:
99
+ response = requests.post(
100
+ f"{base_url}/generate",
101
+ json={
102
+ "input_ids": [0, 1, 2, 3],
103
+ "sampling_params": {
104
+ "max_new_tokens": 8,
105
+ "temperature": 0,
106
+ },
107
+ },
108
+ timeout=600,
109
+ )
110
+ if response.status_code != 200:
111
+ error = response.json()
112
+ raise RuntimeError(f"Sync request failed: {error}")
113
+ # Other nodes should wait for the exit signal from Rank-0 node.
114
+ else:
115
+ start_time_waiting = time.time()
116
+ while proc.is_alive():
117
+ if time.time() - start_time_waiting < timeout:
118
+ time.sleep(10)
119
+ else:
120
+ raise TimeoutError("Waiting for main node timeout!")
89
121
  return proc
90
122
  except requests.RequestException:
91
123
  pass
@@ -118,10 +150,19 @@ def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
118
150
 
119
151
  proc = launch_server_process_and_send_one_request(server_args, compile_args)
120
152
 
121
- kill_process_tree(proc.pid)
122
-
123
153
  print("\nDeepGEMM Kernels compilation finished successfully.")
124
154
 
155
+ # Sleep for safety
156
+ time.sleep(10)
157
+ if proc.is_alive():
158
+ # This is the rank0 node.
159
+ kill_process_tree(proc.pid)
160
+ else:
161
+ try:
162
+ kill_process_tree(proc.pid)
163
+ except Exception:
164
+ pass
165
+
125
166
 
126
167
  if __name__ == "__main__":
127
168
  parser = argparse.ArgumentParser()
@@ -113,7 +113,7 @@ def completion_template_exists(template_name: str) -> bool:
113
113
 
114
114
  def is_completion_template_defined() -> bool:
115
115
  global completion_template_name
116
- return completion_template_name != None
116
+ return completion_template_name is not None
117
117
 
118
118
 
119
119
  def generate_completion_prompt_from_request(request: ChatCompletionRequest) -> str:
@@ -182,7 +182,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
182
182
  tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
183
183
  messages,
184
184
  pil_images[image_index : image_index + image_token_cnt],
185
- bos=False,
185
+ bos=True,
186
186
  eos=True,
187
187
  cropping=len(pil_images) <= 2,
188
188
  max_req_input_len=max_req_input_len,
@@ -73,10 +73,14 @@ class ModelConfig:
73
73
  )
74
74
 
75
75
  if enable_multimodal is None:
76
- if self.hf_config.architectures[0] == "Llama4ForConditionalGeneration":
76
+ mm_disabled_models = [
77
+ "Gemma3ForConditionalGeneration",
78
+ "Llama4ForConditionalGeneration",
79
+ ]
80
+ if self.hf_config.architectures[0] in mm_disabled_models:
77
81
  enable_multimodal = False
78
82
  logger.info(
79
- "Multimodal is disabled for Llama4. To enable it, set --enable-llama4-multimodal."
83
+ f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
80
84
  )
81
85
  else:
82
86
  enable_multimodal = True
@@ -158,7 +162,9 @@ class ModelConfig:
158
162
  self.attention_arch = AttentionArch.MLA
159
163
  self.kv_lora_rank = self.hf_config.kv_lora_rank
160
164
  self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
161
- elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
165
+ elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
166
+ self.hf_text_config, "use_mla", True
167
+ ):
162
168
  self.head_dim = 256
163
169
  self.attention_arch = AttentionArch.MLA
164
170
  self.kv_lora_rank = self.hf_text_config.kv_lora_rank
@@ -0,0 +1,169 @@
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+ """Constrained decoding with llguidance backend."""
15
+
16
+ import json
17
+ import logging
18
+ import os
19
+ from typing import List, Optional, Tuple
20
+
21
+ import torch
22
+ from llguidance import LLMatcher, LLTokenizer, StructTag, grammar_from
23
+ from llguidance.hf import from_tokenizer
24
+ from llguidance.torch import (
25
+ allocate_token_bitmask,
26
+ apply_token_bitmask_inplace,
27
+ fill_next_token_bitmask,
28
+ )
29
+
30
+ from sglang.srt.constrained.base_grammar_backend import (
31
+ BaseGrammarBackend,
32
+ BaseGrammarObject,
33
+ )
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
class GuidanceGrammar(BaseGrammarObject):
    """Per-request grammar state backed by an llguidance ``LLMatcher``.

    Tracks matcher progress token by token and exposes bitmask helpers used
    by the scheduler to constrain the next-token distribution.
    """

    def __init__(self, llguidance_tokenizer: LLTokenizer, serialized_grammar: str):
        super().__init__()
        self.llguidance_tokenizer = llguidance_tokenizer
        self.serialized_grammar = serialized_grammar

        # Matcher verbosity is tunable via LLGUIDANCE_LOG_LEVEL (defaults to "1").
        self.ll_matcher = LLMatcher(
            self.llguidance_tokenizer,
            self.serialized_grammar,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )
        self.finished = False
        # Lazily-allocated token bitmask, grown on demand in allocate_vocab_mask.
        self.bitmask = None

    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
        """Return forced (fast-forward) tokens from the matcher, or None."""
        forced = self.ll_matcher.compute_ff_tokens()
        return (forced, "") if forced else None

    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
        # No string-level state is tracked for llguidance jump-forward.
        return "", -1

    def jump_and_retokenize(
        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ):
        # Retokenization is unnecessary: ff tokens come straight from the matcher.
        pass

    def accept_token(self, token: int):
        """Feed one generated token to the matcher; finish on matcher error."""
        if self.ll_matcher.consume_token(token):
            return
        logger.warning(f"matcher error: {self.ll_matcher.get_error()}")
        self.finished = True

    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
        """Write the allowed-token bitmask for row ``idx`` of ``vocab_mask``."""
        if self.ll_matcher.is_stopped():
            self.finished = True

        fill_next_token_bitmask(self.ll_matcher, vocab_mask, idx)

    def allocate_vocab_mask(
        self, vocab_size: int, batch_size: int, device
    ) -> torch.Tensor:
        # Reuse the cached bitmask when it is already large enough;
        # only reallocate when the batch grows.
        if self.bitmask is not None and self.bitmask.shape[0] >= batch_size:
            return self.bitmask[:batch_size]
        self.bitmask = allocate_token_bitmask(
            batch_size, self.llguidance_tokenizer.vocab_size
        )
        return self.bitmask

    @staticmethod
    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
        return vocab_mask.to(device, non_blocking=True)

    @staticmethod
    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
        apply_token_bitmask_inplace(logits, vocab_mask)

    def copy(self):
        """Return a fresh grammar (new matcher state) for the same grammar spec."""
        return GuidanceGrammar(
            llguidance_tokenizer=self.llguidance_tokenizer,
            serialized_grammar=self.serialized_grammar,
        )

107
+
108
class GuidanceBackend(BaseGrammarBackend):
    """Grammar backend that compiles constraints with llguidance.

    Supports JSON-schema, regex, EBNF, and structural-tag constraints.
    Each ``dispatch_*`` method returns a :class:`GuidanceGrammar`, or
    ``None`` when the constraint cannot be compiled.
    """

    def __init__(
        self,
        tokenizer,
        whitespace_pattern: Optional[str] = None,
        n_vocab: Optional[int] = None,
    ):
        super().__init__()

        self.tokenizer = tokenizer
        self.whitespace_pattern = whitespace_pattern
        self.llguidance_tokenizer = from_tokenizer(self.tokenizer, n_vocab)

    def _from_serialized(self, serialized_grammar) -> Optional[GuidanceGrammar]:
        """Wrap a serialized grammar; return None (and warn) if it is invalid."""
        try:
            return GuidanceGrammar(
                llguidance_tokenizer=self.llguidance_tokenizer,
                serialized_grammar=serialized_grammar,
            )
        except Exception as e:
            logger.warning(f"Skip invalid grammar: {serialized_grammar}, {e=}")
            return None

    def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
        serialized_grammar = LLMatcher.grammar_from_json_schema(
            key_string,
            defaults={
                "whitespace_pattern": self.whitespace_pattern,
            },
        )
        return self._from_serialized(serialized_grammar)

    def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
        serialized_grammar = grammar_from("regex", key_string)
        return self._from_serialized(serialized_grammar)

    def dispatch_ebnf(self, key_string: str) -> Optional[GuidanceGrammar]:
        try:
            serialized_grammar = grammar_from("ebnf", key_string)
            return self._from_serialized(serialized_grammar)
        except ValueError as e:
            # Label the payload as ebnf (was mislabeled "regex=" — copy-paste
            # from dispatch_regex).
            logger.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
            return None

    def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
        try:
            structural_tag = json.loads(key_string)
            tags = [
                StructTag(
                    begin=structure["begin"],
                    grammar=structure["schema"],
                    end=structure["end"],
                    trigger=structural_tag["triggers"][0],  # TODO: support multiple triggers
                )
                for structure in structural_tag["structures"]
            ]
            g = StructTag.to_grammar(tags)
            return self._from_serialized(g)
        except Exception as e:
            # Use the module-level logger (was logging.warning on the root
            # logger, inconsistent with every other handler in this file).
            logger.warning(f"Skip invalid structural_tag: {key_string}, {e=}")
            return None
@@ -463,6 +463,30 @@ def generate_embedding_convs(
463
463
  return convs
464
464
 
465
465
 
466
+ # Models for which the system automatically prepends modality tokens to the prompt
467
+ # when there are more media inputs than modality tokens (e.g. 3 images but only 2 <image> tokens)
468
+ _MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
469
+
470
+
471
+ # adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
472
+ def _get_full_multimodal_text_prompt(
473
+ modality_token: str, modality_count: int, text_prompt: str
474
+ ) -> str:
475
+ """Combine multimodal prompts for a multimodal language model."""
476
+
477
+ # For any existing placeholder in the text prompt, we leave it as is
478
+ left: int = modality_count - text_prompt.count(modality_token)
479
+ if left < 0:
480
+ raise ValueError(
481
+ f"Found more '{modality_token}' placeholders in input prompt than "
482
+ "actual multimodal data items."
483
+ )
484
+
485
+ # NOTE: For now we always add missing modality_token at the front of
486
+ # the prompt. This may change to be customizable in the future.
487
+ return "\n".join([modality_token] * left + [text_prompt])
488
+
489
+
466
490
  def generate_chat_conv(
467
491
  request: ChatCompletionRequest, template_name: str
468
492
  ) -> Conversation:
@@ -520,6 +544,12 @@ def generate_chat_conv(
520
544
  if conv.name != "qwen2-vl"
521
545
  else conv.image_token
522
546
  )
547
+ add_token_as_needed: bool = (
548
+ conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
549
+ )
550
+ if add_token_as_needed:
551
+ image_token = ""
552
+
523
553
  audio_token = conv.audio_token
524
554
  for content in message.content:
525
555
  if content.type == "text":
@@ -533,7 +563,10 @@ def generate_chat_conv(
533
563
  elif content.type == "audio_url":
534
564
  real_content += audio_token
535
565
  conv.append_audio(content.audio_url.url)
536
-
566
+ if add_token_as_needed:
567
+ real_content = _get_full_multimodal_text_prompt(
568
+ conv.image_token, num_image_url, real_content
569
+ )
537
570
  conv.append_message(conv.roles[0], real_content)
538
571
  elif msg_role == "assistant":
539
572
  parsed_content = ""