sglang 0.4.4.post4__tar.gz → 0.4.5.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612)
  1. {sglang-0.4.4.post4/sglang.egg-info → sglang-0.4.5.post1}/PKG-INFO +14 -4
  2. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/README.md +1 -1
  3. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/pyproject.toml +15 -3
  4. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/bench_one_batch.py +21 -0
  5. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/bench_serving.py +10 -4
  6. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/chat_template.py +24 -0
  7. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/model_config.py +40 -4
  8. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/constrained/base_grammar_backend.py +26 -5
  9. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/constrained/llguidance_backend.py +1 -0
  10. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/constrained/outlines_backend.py +1 -0
  11. sglang-0.4.5.post1/sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  12. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/constrained/xgrammar_backend.py +1 -0
  13. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/conversation.py +29 -4
  14. sglang-0.4.5.post1/sglang/srt/disaggregation/base/__init__.py +8 -0
  15. sglang-0.4.5.post1/sglang/srt/disaggregation/base/conn.py +113 -0
  16. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/disaggregation/decode.py +18 -5
  17. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/disaggregation/mini_lb.py +53 -122
  18. sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  19. sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/conn.py +615 -0
  20. sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
  21. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/disaggregation/prefill.py +43 -19
  22. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/disaggregation/utils.py +31 -0
  23. sglang-0.4.5.post1/sglang/srt/entrypoints/EngineBase.py +53 -0
  24. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/entrypoints/engine.py +36 -8
  25. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/entrypoints/http_server.py +37 -8
  26. sglang-0.4.5.post1/sglang/srt/entrypoints/http_server_engine.py +142 -0
  27. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/entrypoints/verl_engine.py +37 -10
  28. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/hf_transformers_utils.py +4 -0
  29. sglang-0.4.5.post1/sglang/srt/layers/attention/flashattention_backend.py +1159 -0
  30. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashinfer_backend.py +13 -7
  31. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/vision.py +1 -1
  32. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/dp_attention.py +2 -4
  33. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/elementwise.py +15 -2
  34. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/linear.py +1 -0
  35. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  36. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_native.py +5 -0
  37. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  38. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  39. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  40. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  41. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  42. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  43. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  44. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  46. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  48. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  49. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  50. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +51 -24
  51. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  52. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/router.py +7 -1
  53. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/topk.py +37 -16
  54. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/__init__.py +13 -5
  55. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  56. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
  57. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
  58. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8.py +28 -14
  59. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8_kernel.py +130 -4
  60. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8_utils.py +34 -6
  61. sglang-0.4.5.post1/sglang/srt/layers/quantization/kv_cache.py +89 -0
  62. sglang-0.4.5.post1/sglang/srt/layers/quantization/modelopt_quant.py +463 -0
  63. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/moe_wna16.py +2 -0
  64. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
  65. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  66. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/radix_attention.py +14 -0
  67. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/rotary_embedding.py +75 -1
  68. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/io_struct.py +254 -97
  69. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/mm_utils.py +3 -2
  70. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
  71. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  72. sglang-0.4.5.post1/sglang/srt/managers/multimodal_processors/mllama4.py +146 -0
  73. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/schedule_batch.py +62 -21
  74. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/scheduler.py +71 -14
  75. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/tokenizer_manager.py +17 -3
  76. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/tp_worker.py +1 -0
  77. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mem_cache/memory_pool.py +14 -1
  78. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/metrics/collector.py +9 -0
  79. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  80. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_executor/forward_batch_info.py +234 -15
  81. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_executor/model_runner.py +49 -9
  82. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_loader/loader.py +31 -4
  83. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_loader/weight_utils.py +4 -2
  84. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/baichuan.py +2 -0
  85. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/chatglm.py +1 -0
  86. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/commandr.py +1 -0
  87. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/dbrx.py +1 -0
  88. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/deepseek.py +1 -0
  89. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_v2.py +248 -61
  90. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/exaone.py +1 -0
  91. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/gemma.py +1 -0
  92. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/gemma2.py +1 -0
  93. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/gemma3_causal.py +1 -0
  94. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/gpt2.py +1 -0
  95. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/gpt_bigcode.py +1 -0
  96. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/granite.py +1 -0
  97. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/grok.py +1 -0
  98. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/internlm2.py +1 -0
  99. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llama.py +13 -4
  100. sglang-0.4.5.post1/sglang/srt/models/llama4.py +487 -0
  101. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/minicpm.py +1 -0
  102. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/minicpm3.py +2 -0
  103. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/mixtral.py +1 -0
  104. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/mixtral_quant.py +1 -0
  105. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/mllama.py +51 -8
  106. sglang-0.4.5.post1/sglang/srt/models/mllama4.py +227 -0
  107. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/olmo.py +1 -0
  108. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/olmo2.py +1 -0
  109. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/olmoe.py +1 -0
  110. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/phi3_small.py +1 -0
  111. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen.py +1 -0
  112. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen2.py +1 -0
  113. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_5_vl.py +35 -70
  114. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_moe.py +1 -0
  115. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_vl.py +27 -25
  116. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/stablelm.py +1 -0
  117. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/xverse.py +1 -0
  118. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/xverse_moe.py +1 -0
  119. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/openai_api/adapter.py +4 -1
  120. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/patch_torch.py +11 -0
  121. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/server_args.py +34 -0
  122. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  123. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_utils.py +1 -11
  124. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_worker.py +6 -2
  125. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/utils.py +120 -9
  126. sglang-0.4.5.post1/sglang/test/attention/test_flashattn_backend.py +350 -0
  127. sglang-0.4.5.post1/sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  128. sglang-0.4.5.post1/sglang/test/attention/test_prefix_chunk_info.py +224 -0
  129. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_block_fp8.py +57 -0
  130. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_utils.py +19 -8
  131. sglang-0.4.5.post1/sglang/version.py +1 -0
  132. {sglang-0.4.4.post4 → sglang-0.4.5.post1/sglang.egg-info}/PKG-INFO +14 -4
  133. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang.egg-info/SOURCES.txt +27 -3
  134. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang.egg-info/requires.txt +13 -2
  135. sglang-0.4.4.post4/sglang/srt/disaggregation/conn.py +0 -81
  136. sglang-0.4.4.post4/sglang/srt/layers/attention/flashattention_backend.py +0 -752
  137. sglang-0.4.4.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -146
  138. sglang-0.4.4.post4/sglang/srt/layers/quantization/kv_cache.py +0 -98
  139. sglang-0.4.4.post4/sglang/srt/layers/quantization/modelopt_quant.py +0 -196
  140. sglang-0.4.4.post4/sglang/test/attention/test_flashattn_backend.py +0 -312
  141. sglang-0.4.4.post4/sglang/version.py +0 -1
  142. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/LICENSE +0 -0
  143. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/setup.cfg +0 -0
  144. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/__init__.py +0 -0
  145. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/api.py +0 -0
  146. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/bench_offline_throughput.py +0 -0
  147. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/bench_one_batch_server.py +0 -0
  148. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/check_env.py +0 -0
  149. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/global_config.py +0 -0
  150. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/__init__.py +0 -0
  151. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/backend/__init__.py +0 -0
  152. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/backend/anthropic.py +0 -0
  153. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/backend/base_backend.py +0 -0
  154. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/backend/litellm.py +0 -0
  155. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/backend/openai.py +0 -0
  156. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  157. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/backend/vertexai.py +0 -0
  158. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/choices.py +0 -0
  159. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/compiler.py +0 -0
  160. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/interpreter.py +0 -0
  161. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/ir.py +0 -0
  162. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/lang/tracer.py +0 -0
  163. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/launch_server.py +0 -0
  164. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/llama3_eval.py +0 -0
  165. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/_custom_ops.py +0 -0
  166. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/aio_rwlock.py +0 -0
  167. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/code_completion_parser.py +0 -0
  168. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/__init__.py +0 -0
  169. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/chatglm.py +0 -0
  170. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/dbrx.py +0 -0
  171. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/deepseekvl2.py +0 -0
  172. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/device_config.py +0 -0
  173. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/exaone.py +0 -0
  174. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/janus_pro.py +0 -0
  175. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/load_config.py +0 -0
  176. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/configs/utils.py +0 -0
  177. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/__init__.py +0 -0
  178. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/base_connector.py +0 -0
  179. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/redis.py +0 -0
  180. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/s3.py +0 -0
  181. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/serde/__init__.py +0 -0
  182. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
  183. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/serde/serde.py +0 -0
  184. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/connector/utils.py +0 -0
  185. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  186. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/custom_op.py +0 -0
  187. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/__init__.py +0 -0
  188. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/communication_op.py +0 -0
  189. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  190. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  191. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  192. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  193. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  194. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  195. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  196. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  197. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  198. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/distributed/utils.py +0 -0
  199. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/function_call_parser.py +0 -0
  200. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/activation.py +0 -0
  201. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  202. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  203. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  204. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  205. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  206. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  207. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  208. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  209. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  210. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  211. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  212. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/attention/utils.py +0 -0
  213. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/layernorm.py +0 -0
  214. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/logits_processor.py +0 -0
  215. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  216. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  217. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  218. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  219. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  220. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  221. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  222. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  223. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  224. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  225. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  226. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  227. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  228. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  229. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  230. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  231. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  232. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  233. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  234. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  235. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  236. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  237. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  238. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  239. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  240. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  241. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  242. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  243. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  244. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  245. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  246. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  247. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  248. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  249. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  250. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  251. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  252. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  253. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  254. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  255. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  257. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  258. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  259. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  260. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  261. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  262. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  263. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  264. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  265. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  266. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  267. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  268. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  269. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  270. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  271. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  272. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  273. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  274. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  275. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  276. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  277. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  278. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  279. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  280. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  281. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  282. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  283. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  284. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  285. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  286. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  287. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  288. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  289. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  290. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  291. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  292. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  293. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  294. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  295. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  296. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  297. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  298. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  299. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  300. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  301. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  302. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  303. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  304. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  305. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  306. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  307. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  308. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  309. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  310. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  311. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  312. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  313. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  314. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  315. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  316. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  317. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  318. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  319. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  320. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  321. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  322. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  323. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  324. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  325. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  326. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  327. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  328. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  329. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  330. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  331. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  332. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  333. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  334. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  335. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  336. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  337. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  338. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  339. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  340. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/parameter.py +0 -0
  341. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/pooler.py +0 -0
  342. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/awq.py +0 -0
  343. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  344. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  345. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  346. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  347. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  348. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  349. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  501. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
  502. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  503. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  504. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/utils.py +0 -0
  505. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/sampler.py +0 -0
  506. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  507. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  508. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/backend/__init__.py +0 -0
  509. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
  510. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  511. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
  512. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/layers.py +0 -0
  513. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/lora.py +0 -0
  514. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/lora_config.py +0 -0
  515. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/lora_manager.py +0 -0
  516. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/mem_pool.py +0 -0
  517. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  518. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  519. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  520. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  521. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  522. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/lora/utils.py +0 -0
  523. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/cache_controller.py +0 -0
  524. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/configure_logging.py +0 -0
  525. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  526. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  527. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/expert_distribution.py +0 -0
  528. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
  529. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  530. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
  531. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
  532. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  533. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
  534. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  535. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
  536. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  537. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  538. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/session_controller.py +0 -0
  539. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  540. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/managers/utils.py +0 -0
  541. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  542. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  543. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  544. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  545. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  546. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  547. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/metrics/func_timer.py +0 -0
  548. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/mm_utils.py +0 -0
  549. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_loader/__init__.py +0 -0
  550. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_loader/utils.py +0 -0
  551. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/model_parallel.py +0 -0
  552. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/clip.py +0 -0
  553. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  554. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_nextn.py +0 -0
  555. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_vl2.py +0 -0
  556. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  557. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/gemma3_mm.py +0 -0
  558. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  559. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llama_classification.py +0 -0
  560. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llama_eagle.py +0 -0
  561. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llama_eagle3.py +0 -0
  562. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llama_embedding.py +0 -0
  563. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llama_reward.py +0 -0
  564. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llava.py +0 -0
  565. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/llavavid.py +0 -0
  566. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/minicpmo.py +0 -0
  567. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/minicpmv.py +0 -0
  568. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/mistral.py +0 -0
  569. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_classification.py +0 -0
  570. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  571. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_rm.py +0 -0
  572. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/registry.py +0 -0
  573. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  574. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/models/yivl.py +0 -0
  575. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/openai_api/protocol.py +0 -0
  576. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/platforms/interface.py +0 -0
  577. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/reasoning_parser.py +0 -0
  578. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  579. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  580. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  581. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  582. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  583. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  584. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  585. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  586. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/server.py +0 -0
  587. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  588. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/speculative/spec_info.py +0 -0
  589. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  590. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/srt/warmup.py +0 -0
  591. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/__init__.py +0 -0
  592. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/attention/__init__.py +0 -0
  593. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  594. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  595. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/run_eval.py +0 -0
  596. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/runners.py +0 -0
  597. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/send_one.py +0 -0
  598. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/simple_eval_common.py +0 -0
  599. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  600. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  601. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/simple_eval_math.py +0 -0
  602. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  603. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  604. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_activation.py +0 -0
  605. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_block_fp8_ep.py +0 -0
  606. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_custom_ops.py +0 -0
  607. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
  608. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_layernorm.py +0 -0
  609. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/test/test_programs.py +0 -0
  610. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang/utils.py +0 -0
  611. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang.egg-info/dependency_links.txt +0 -0
  612. {sglang-0.4.4.post4 → sglang-0.4.5.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.4.post4
3
+ Version: 0.4.5.post1
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -239,20 +239,30 @@ Requires-Dist: python-multipart; extra == "runtime-common"
239
239
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
240
240
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
241
241
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
242
- Requires-Dist: transformers==4.51.0; extra == "runtime-common"
242
+ Requires-Dist: transformers==4.51.1; extra == "runtime-common"
243
243
  Requires-Dist: uvicorn; extra == "runtime-common"
244
244
  Requires-Dist: uvloop; extra == "runtime-common"
245
245
  Requires-Dist: compressed-tensors; extra == "runtime-common"
246
246
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
247
247
  Provides-Extra: srt
248
248
  Requires-Dist: sglang[runtime_common]; extra == "srt"
249
- Requires-Dist: sgl-kernel==0.0.8; extra == "srt"
249
+ Requires-Dist: sgl-kernel==0.0.9.post1; extra == "srt"
250
250
  Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
251
251
  Requires-Dist: torch==2.5.1; extra == "srt"
252
+ Requires-Dist: torchvision==0.20.1; extra == "srt"
252
253
  Requires-Dist: cuda-python; extra == "srt"
253
254
  Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
254
255
  Requires-Dist: partial_json_parser; extra == "srt"
255
256
  Requires-Dist: einops; extra == "srt"
257
+ Provides-Extra: blackwell
258
+ Requires-Dist: sglang[runtime_common]; extra == "blackwell"
259
+ Requires-Dist: sgl-kernel; extra == "blackwell"
260
+ Requires-Dist: torch; extra == "blackwell"
261
+ Requires-Dist: torchvision; extra == "blackwell"
262
+ Requires-Dist: cuda-python; extra == "blackwell"
263
+ Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
264
+ Requires-Dist: partial_json_parser; extra == "blackwell"
265
+ Requires-Dist: einops; extra == "blackwell"
256
266
  Provides-Extra: srt-hip
257
267
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
258
268
  Requires-Dist: torch; extra == "srt-hip"
@@ -391,7 +401,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
391
401
 
392
402
  ## Adoption and Sponsorship
393
403
  The project has been deployed to large-scale production, generating trillions of tokens every day.
394
- It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
404
+ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
395
405
 
396
406
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
397
407
 
@@ -63,7 +63,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
63
63
 
64
64
  ## Adoption and Sponsorship
65
65
  The project has been deployed to large-scale production, generating trillions of tokens every day.
66
- It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
66
+ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
67
67
 
68
68
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
69
69
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.4.post4"
7
+ version = "0.4.5.post1"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -38,7 +38,7 @@ runtime_common = [
38
38
  "pyzmq>=25.1.2",
39
39
  "soundfile==0.13.1",
40
40
  "torchao>=0.7.0",
41
- "transformers==4.51.0",
41
+ "transformers==4.51.1",
42
42
  "uvicorn",
43
43
  "uvloop",
44
44
  "compressed-tensors",
@@ -47,9 +47,21 @@ runtime_common = [
47
47
 
48
48
  srt = [
49
49
  "sglang[runtime_common]",
50
- "sgl-kernel==0.0.8",
50
+ "sgl-kernel==0.0.9.post1",
51
51
  "flashinfer_python==0.2.3",
52
52
  "torch==2.5.1",
53
+ "torchvision==0.20.1",
54
+ "cuda-python",
55
+ "outlines>=0.0.44,<=0.1.11",
56
+ "partial_json_parser",
57
+ "einops",
58
+ ]
59
+
60
+ blackwell = [
61
+ "sglang[runtime_common]",
62
+ "sgl-kernel",
63
+ "torch",
64
+ "torchvision",
53
65
  "cuda-python",
54
66
  "outlines>=0.0.44,<=0.1.11",
55
67
  "partial_json_parser",
@@ -60,6 +60,7 @@ from sglang.srt.configs.model_config import ModelConfig
60
60
  from sglang.srt.entrypoints.engine import _set_envs_and_config
61
61
  from sglang.srt.hf_transformers_utils import get_tokenizer
62
62
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
63
+ from sglang.srt.managers.scheduler import Scheduler
63
64
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
64
65
  from sglang.srt.model_executor.model_runner import ModelRunner
65
66
  from sglang.srt.sampling.sampling_params import SamplingParams
@@ -135,6 +136,7 @@ def load_model(server_args, port_args, tp_rank):
135
136
  context_length=server_args.context_length,
136
137
  model_override_args=server_args.json_model_override_args,
137
138
  is_embedding=server_args.is_embedding,
139
+ enable_multimodal=server_args.enable_multimodal,
138
140
  dtype=server_args.dtype,
139
141
  quantization=server_args.quantization,
140
142
  )
@@ -184,6 +186,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
184
186
  req.prefix_indices = []
185
187
  req.fill_ids = req.origin_input_ids
186
188
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
189
+ req.logprob_start_len = len(req.origin_input_ids) - 1
187
190
  reqs.append(req)
188
191
 
189
192
  return input_ids, reqs
@@ -199,6 +202,7 @@ def prepare_extend_inputs_for_correctness_test(
199
202
  i, : bench_args.cut_len
200
203
  ]
201
204
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
205
+ req.logprob_start_len = len(req.origin_input_ids) - 1
202
206
  return reqs
203
207
 
204
208
 
@@ -220,6 +224,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
220
224
  req.prefix_indices = []
221
225
  req.fill_ids = req.origin_input_ids
222
226
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
227
+ req.logprob_start_len = len(req.origin_input_ids) - 1
223
228
  reqs.append(req)
224
229
 
225
230
  return reqs
@@ -238,6 +243,7 @@ def extend(reqs, model_runner):
238
243
  enable_custom_logit_processor=False,
239
244
  )
240
245
  batch.prepare_for_extend()
246
+ _maybe_prepare_dp_attn_batch(batch, model_runner)
241
247
  model_worker_batch = batch.get_model_worker_batch()
242
248
  forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
243
249
  logits_output = model_runner.forward(forward_batch)
@@ -249,6 +255,7 @@ def extend(reqs, model_runner):
249
255
  def decode(input_token_ids, batch, model_runner):
250
256
  batch.output_ids = input_token_ids
251
257
  batch.prepare_for_decode()
258
+ _maybe_prepare_dp_attn_batch(batch, model_runner)
252
259
  model_worker_batch = batch.get_model_worker_batch()
253
260
  forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
254
261
  logits_output = model_runner.forward(forward_batch)
@@ -256,6 +263,20 @@ def decode(input_token_ids, batch, model_runner):
256
263
  return next_token_ids, logits_output.next_token_logits
257
264
 
258
265
 
266
+ def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
267
+ if model_runner.server_args.enable_dp_attention:
268
+ Scheduler.prepare_dp_attn_batch_raw(
269
+ batch,
270
+ dp_size=model_runner.server_args.dp_size,
271
+ attn_tp_size=1,
272
+ tp_cpu_group=model_runner.tp_group.cpu_group,
273
+ get_idle_batch=None,
274
+ disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
275
+ spec_algorithm=SpeculativeAlgorithm.NONE,
276
+ speculative_num_draft_tokens=None,
277
+ )
278
+
279
+
259
280
  def correctness_test(
260
281
  server_args,
261
282
  port_args,
@@ -490,7 +490,7 @@ def get_dataset(args, tokenizer):
490
490
  prompt_suffix=args.prompt_suffix,
491
491
  apply_chat_template=args.apply_chat_template,
492
492
  )
493
- elif args.dataset_name == "random":
493
+ elif args.dataset_name.startswith("random"):
494
494
  input_requests = sample_random_requests(
495
495
  input_len=args.random_input_len,
496
496
  output_len=args.random_output_len,
@@ -498,6 +498,7 @@ def get_dataset(args, tokenizer):
498
498
  range_ratio=args.random_range_ratio,
499
499
  tokenizer=tokenizer,
500
500
  dataset_path=args.dataset_path,
501
+ random_sample=args.dataset_name == "random",
501
502
  )
502
503
  elif args.dataset_name == "generated-shared-prefix":
503
504
  input_requests = sample_generated_shared_prefix_requests(
@@ -687,6 +688,7 @@ def sample_random_requests(
687
688
  range_ratio: float,
688
689
  tokenizer: PreTrainedTokenizerBase,
689
690
  dataset_path: str,
691
+ random_sample: bool = True,
690
692
  ) -> List[Tuple[str, int, int]]:
691
693
 
692
694
  input_lens = np.random.randint(
@@ -700,11 +702,15 @@ def sample_random_requests(
700
702
  size=num_prompts,
701
703
  )
702
704
 
703
- if True:
705
+ if random_sample:
704
706
  # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
705
707
 
706
708
  # Download sharegpt if necessary
707
709
  if not os.path.isfile(dataset_path):
710
+ print(
711
+ "If you do not want to randomly sample from a dataset,"
712
+ " please use --dataset-name random-ids."
713
+ )
708
714
  dataset_path = download_and_cache_file(SHAREGPT_URL)
709
715
 
710
716
  # Load the dataset.
@@ -1223,7 +1229,7 @@ async def benchmark(
1223
1229
  output_file_name = args.output_file
1224
1230
  else:
1225
1231
  now = datetime.now().strftime("%m%d")
1226
- if args.dataset_name == "random":
1232
+ if args.dataset_name.startswith("random"):
1227
1233
  output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
1228
1234
  else:
1229
1235
  output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1442,7 +1448,7 @@ if __name__ == "__main__":
1442
1448
  "--dataset-name",
1443
1449
  type=str,
1444
1450
  default="sharegpt",
1445
- choices=["sharegpt", "random", "generated-shared-prefix"],
1451
+ choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
1446
1452
  help="Name of the dataset to benchmark on.",
1447
1453
  )
1448
1454
  parser.add_argument(
@@ -294,6 +294,30 @@ register_chat_template(
294
294
  )
295
295
  )
296
296
 
297
+ # Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
298
+ register_chat_template(
299
+ ChatTemplate(
300
+ name="llama-4",
301
+ default_system_prompt=None,
302
+ role_prefix_and_suffix={
303
+ "system": (
304
+ "<|header_start|>system<|header_end|>\n\n",
305
+ "<|eot|>",
306
+ ),
307
+ "user": (
308
+ "<|header_start|>user<|header_end|>\n\n",
309
+ "<|eot|>",
310
+ ),
311
+ "assistant": (
312
+ "<|header_start|>assistant<|header_end|>\n\n",
313
+ "<|eot|>",
314
+ ),
315
+ },
316
+ stop_str=("<|eot|>",),
317
+ image_token="<|image|>",
318
+ )
319
+ )
320
+
297
321
  # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
298
322
  register_chat_template(
299
323
  ChatTemplate(
@@ -15,6 +15,7 @@
15
15
  import json
16
16
  import logging
17
17
  import math
18
+ import os
18
19
  from enum import IntEnum, auto
19
20
  from typing import List, Optional, Set, Union
20
21
 
@@ -42,10 +43,12 @@ class ModelConfig:
42
43
  context_length: Optional[int] = None,
43
44
  model_override_args: Optional[str] = None,
44
45
  is_embedding: Optional[bool] = None,
46
+ enable_multimodal: Optional[bool] = None,
45
47
  dtype: str = "auto",
46
48
  quantization: Optional[str] = None,
47
49
  override_config_file: Optional[str] = None,
48
50
  ) -> None:
51
+
49
52
  self.model_path = model_path
50
53
  self.revision = revision
51
54
  self.quantization = quantization
@@ -65,15 +68,32 @@ class ModelConfig:
65
68
  **kwargs,
66
69
  )
67
70
  self.hf_text_config = get_hf_text_config(self.hf_config)
71
+ self.attention_chunk_size = getattr(
72
+ self.hf_text_config, "attention_chunk_size", None
73
+ )
74
+
75
+ if enable_multimodal is None:
76
+ if self.hf_config.architectures == "Llama4ForConditionalGeneration":
77
+ enable_multimodal = False
78
+ else:
79
+ enable_multimodal = True
68
80
 
69
81
  # Check model type
70
82
  self.is_generation = is_generation_model(
71
83
  self.hf_config.architectures, is_embedding
72
84
  )
73
- self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
74
- self.is_multimodal_gen = is_multimodal_gen_model(self.hf_config.architectures)
75
- self.is_image_gen = is_image_gen_model(self.hf_config.architectures)
76
- self.is_audio_model = is_audio_model(self.hf_config.architectures)
85
+ self.is_multimodal = enable_multimodal and is_multimodal_model(
86
+ self.hf_config.architectures
87
+ )
88
+ self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
89
+ self.hf_config.architectures
90
+ )
91
+ self.is_image_gen = enable_multimodal and is_image_gen_model(
92
+ self.hf_config.architectures
93
+ )
94
+ self.is_audio_model = enable_multimodal and is_audio_model(
95
+ self.hf_config.architectures
96
+ )
77
97
  self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
78
98
  self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
79
99
 
@@ -231,6 +251,20 @@ class ModelConfig:
231
251
  if quant_cfg is None:
232
252
  # compressed-tensors uses a "compression_config" key
233
253
  quant_cfg = getattr(self.hf_config, "compression_config", None)
254
+ if quant_cfg is None:
255
+ # check if is modelopt model -- modelopt doesn't have corresponding field
256
+ # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
257
+ # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
258
+ is_local = os.path.exists(self.model_path)
259
+ modelopt_quant_config = {"quant_method": "modelopt"}
260
+ if not is_local:
261
+ from huggingface_hub import HfApi
262
+
263
+ hf_api = HfApi()
264
+ if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
265
+ quant_cfg = modelopt_quant_config
266
+ elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
267
+ quant_cfg = modelopt_quant_config
234
268
  return quant_cfg
235
269
 
236
270
  # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -261,6 +295,7 @@ class ModelConfig:
261
295
  "moe_wna16",
262
296
  ]
263
297
  compatible_quantization_methods = {
298
+ "modelopt_fp4": ["modelopt"],
264
299
  "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
265
300
  "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
266
301
  }
@@ -468,6 +503,7 @@ multimodal_model_archs = [
468
503
  "Grok1VForCausalLM",
469
504
  "Grok1AForCausalLM",
470
505
  "LlavaLlamaForCausalLM",
506
+ "Llama4ForConditionalGeneration",
471
507
  "LlavaMistralForCausalLM",
472
508
  "LlavaQwenForCausalLM",
473
509
  "LlavaVidForCausalLM",
@@ -28,6 +28,18 @@ logger = logging.getLogger(__name__)
28
28
 
29
29
 
30
30
  class BaseGrammarObject(ABC):
31
+
32
+ def __init__(self):
33
+ self._finished = False
34
+
35
+ @property
36
+ def finished(self):
37
+ return self._finished
38
+
39
+ @finished.setter
40
+ def finished(self, finished):
41
+ self._finished = finished
42
+
31
43
  @abstractmethod
32
44
  def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
33
45
  """
@@ -59,6 +71,13 @@ class BaseGrammarObject(ABC):
59
71
  """
60
72
  raise NotImplementedError
61
73
 
74
+ @abstractmethod
75
+ def accept_token(self, token: int) -> None:
76
+ """
77
+ Accept a token in the grammar.
78
+ """
79
+ raise NotImplementedError
80
+
62
81
  @abstractmethod
63
82
  def allocate_vocab_mask(
64
83
  self, vocab_size: int, batch_size: int, device
@@ -90,7 +109,7 @@ class CacheEntry:
90
109
  event: Event
91
110
 
92
111
 
93
- class BaseGrammarBackend(ABC):
112
+ class BaseGrammarBackend:
94
113
  def __init__(self):
95
114
  self.executor = ThreadPoolExecutor()
96
115
  self.cache: Dict[Tuple[str, str], CacheEntry] = {}
@@ -107,19 +126,15 @@ class BaseGrammarBackend(ABC):
107
126
  """
108
127
  raise ValueError(f"Invalid key_type: {key_type}={key_string}")
109
128
 
110
- @abstractmethod
111
129
  def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
112
130
  return self._not_supported("json", key_string)
113
131
 
114
- @abstractmethod
115
132
  def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
116
133
  return self._not_supported("regex", key_string)
117
134
 
118
- @abstractmethod
119
135
  def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
120
136
  return self._not_supported("ebnf", key_string)
121
137
 
122
- @abstractmethod
123
138
  def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
124
139
  return self._not_supported("structural_tag", key_string)
125
140
 
@@ -195,4 +210,10 @@ def create_grammar_backend(
195
210
  else:
196
211
  raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
197
212
 
213
+ if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
214
+ from .reasoner_grammar_backend import ReasonerGrammarBackend
215
+
216
+ grammar_backend = ReasonerGrammarBackend(
217
+ grammar_backend, tokenizer.think_end_id
218
+ )
198
219
  return grammar_backend
@@ -33,6 +33,7 @@ class GuidanceGrammar(BaseGrammarObject):
33
33
  def __init__(
34
34
  self, llguidance_tokenizer: llguidance.LLTokenizer, serialized_grammar: str
35
35
  ):
36
+ super().__init__()
36
37
  self.llguidance_tokenizer = llguidance_tokenizer
37
38
  self.serialized_grammar = serialized_grammar
38
39
 
@@ -44,6 +44,7 @@ class OutlinesGrammar(BaseGrammarObject):
44
44
  guide: RegexGuide,
45
45
  jump_forward_map: Union[OutlinesJumpForwardMap, None],
46
46
  ) -> None:
47
+ super().__init__()
47
48
  self.guide = guide
48
49
  self.jump_forward_map = jump_forward_map
49
50
  self.state = 0
@@ -0,0 +1,101 @@
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+ """The baseclass of a backend for reasoner grammar-guided constrained decoding."""
15
+
16
+ from concurrent.futures import Future
17
+ from typing import List, Optional, Tuple
18
+
19
+ import torch
20
+
21
+ from .base_grammar_backend import BaseGrammarBackend, BaseGrammarObject
22
+
23
+
24
class ReasonerGrammarObject(BaseGrammarObject):
    """Grammar decorator that suspends constrained decoding while the model
    is inside its reasoning ("think") section.

    While ``is_in_reasoning`` is True, vocab masks are left untouched and
    accepted tokens are not forwarded to the wrapped grammar. Observing
    ``think_end_id`` ends the reasoning phase; from then on every call
    delegates to the wrapped grammar object.
    """

    def __init__(self, grammar: BaseGrammarObject, think_end_id):
        super().__init__()
        self.grammar = grammar
        self.think_end_id = think_end_id
        # Generation always starts inside the reasoning section.
        self.is_in_reasoning = True

    @property
    def finished(self):
        # Completion state lives on the wrapped grammar, not on this wrapper.
        return self.grammar.finished

    @finished.setter
    def finished(self, finished):
        self.grammar.finished = finished

    def allocate_vocab_mask(
        self, vocab_size: int, batch_size: int, device
    ) -> torch.Tensor:
        return self.grammar.allocate_vocab_mask(vocab_size, batch_size, device)

    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
        # During reasoning the mask stays untouched, i.e. decoding is
        # unconstrained until the think-end token is seen.
        if self.is_in_reasoning:
            return
        self.grammar.fill_vocab_mask(vocab_mask, idx)

    def move_vocab_mask(self, vocab_mask: torch.Tensor, device) -> torch.Tensor:
        return self.grammar.move_vocab_mask(vocab_mask, device)

    @property
    def apply_vocab_mask(self):
        return self.grammar.apply_vocab_mask

    def accept_token(self, token: int):
        if token == self.think_end_id:
            # The think-end token only flips the phase; it is never fed to
            # the wrapped grammar.
            self.is_in_reasoning = False
        elif not self.is_in_reasoning:
            self.grammar.accept_token(token)

    def try_jump_forward(self, tokenizer):
        return self.grammar.try_jump_forward(tokenizer)

    def jump_forward_str_state(self, helper):
        return self.grammar.jump_forward_str_state(helper)

    def jump_and_retokenize(
        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ):
        return self.grammar.jump_and_retokenize(
            old_output_ids, new_output_ids, next_state
        )

    def copy(self) -> BaseGrammarObject:
        # A copy restarts in the reasoning phase, mirroring __init__.
        return ReasonerGrammarObject(self.grammar.copy(), self.think_end_id)
77
+
78
+
79
class ReasonerGrammarBackend(BaseGrammarBackend):
    """Backend decorator that wraps every grammar produced by an inner
    backend in a :class:`ReasonerGrammarObject`."""

    def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
        # NOTE: super().__init__() is intentionally not called — every
        # inherited entry point below delegates to the wrapped backend, so
        # this class needs no executor/cache of its own.
        self.grammar_backend = grammar_backend
        self.think_end_id = think_end_id

    def get_cached_value(self, key: Tuple[str, str]) -> Optional[ReasonerGrammarObject]:
        inner = self.grammar_backend.get_cached_value(key)
        if not inner:
            return None
        return ReasonerGrammarObject(inner, self.think_end_id)

    def get_future_value(self, key: Tuple[str, str]) -> Future:
        wrapped: Future = Future()

        def on_done(fut: Future):
            inner = fut.result()
            # Preserve "no grammar available" (falsy result) as None.
            wrapped.set_result(
                ReasonerGrammarObject(inner, self.think_end_id) if inner else None
            )

        self.grammar_backend.get_future_value(key).add_done_callback(on_done)
        return wrapped

    def reset(self):
        self.grammar_backend.reset()
@@ -48,6 +48,7 @@ class XGrammarGrammar(BaseGrammarObject):
48
48
  ctx: CompiledGrammar,
49
49
  override_stop_tokens: Optional[Union[List[int], int]],
50
50
  ) -> None:
51
+ super().__init__()
51
52
  self.matcher = matcher
52
53
  self.vocab_size = vocab_size
53
54
  self.ctx = ctx
@@ -33,6 +33,7 @@ class SeparatorStyle(IntEnum):
33
33
  ADD_NEW_LINE_SINGLE = auto()
34
34
  LLAMA2 = auto()
35
35
  LLAMA3 = auto()
36
+ LLAMA4 = auto()
36
37
  CHATGLM = auto()
37
38
  CHATML = auto()
38
39
  CHATINTERN = auto()
@@ -156,19 +157,30 @@ class Conversation:
156
157
  else:
157
158
  ret += role + ":"
158
159
  return ret
160
+ elif self.sep_style == SeparatorStyle.LLAMA4:
161
+ # begin_of_text is added by default
162
+ if self.system_message:
163
+ ret = system_prompt
164
+ else:
165
+ ret = ""
166
+ for i, (role, message) in enumerate(self.messages):
167
+ if message:
168
+ ret += f"<|header_start|>{role}<|header_end|>\n\n"
169
+ ret += f"{message.strip()}<|eot|>"
170
+ else:
171
+ ret += f"<|header_start|>{role}<|header_end|>\n\n"
172
+ return ret
159
173
  elif self.sep_style == SeparatorStyle.LLAMA3:
160
- ret = "<|begin_of_text|>"
161
174
  if self.system_message:
162
- ret += system_prompt
175
+ ret = system_prompt
163
176
  else:
164
- ret += ""
177
+ ret = ""
165
178
  for i, (role, message) in enumerate(self.messages):
166
179
  if message:
167
180
  ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
168
181
  ret += f"{message.strip()}<|eot_id|>"
169
182
  else:
170
183
  ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
171
- # print(ret)
172
184
  return ret
173
185
  elif self.sep_style == SeparatorStyle.LLAMA2:
174
186
  seps = [self.sep, self.sep2]
@@ -561,6 +573,19 @@ register_conv_template(
561
573
  )
562
574
  )
563
575
 
576
# reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
# Llama 4 uses <|header_start|>/<|header_end|> role delimiters and <|eot|>
# turn terminators (rendered by SeparatorStyle.LLAMA4 in get_prompt).
register_conv_template(
    Conversation(
        name="llama-4",
        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.LLAMA4,
        sep="",
        stop_str=["<|end_of_text|>", "<|eot|>", "<|eom|>"],
        image_token="<|image|>",
    )
)
588
+
564
589
  register_conv_template(
565
590
  Conversation(
566
591
  name="chatml",
@@ -0,0 +1,8 @@
1
+ from .conn import (
2
+ BaseKVBootstrapServer,
3
+ BaseKVManager,
4
+ BaseKVReceiver,
5
+ BaseKVSender,
6
+ KVArgs,
7
+ KVPoll,
8
+ )