sglang 0.4.5__tar.gz → 0.4.5.post1__tar.gz

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
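As a rough way to reproduce a file-level listing like the one below, here is a minimal sketch against the public PyPI JSON API (assumptions: both sdists are published on PyPI and network access is available; sdist_members is an illustrative helper, not part of sglang):

    import io, json, tarfile, urllib.request

    def sdist_members(version):
        # Look up the release metadata and find the sdist (.tar.gz) URL.
        meta = json.load(urllib.request.urlopen(
            f"https://pypi.org/pypi/sglang/{version}/json"))
        url = next(u["url"] for u in meta["urls"]
                   if u["filename"].endswith(".tar.gz"))
        data = urllib.request.urlopen(url).read()
        with tarfile.open(fileobj=io.BytesIO(data)) as tf:
            # Strip the top-level "sglang-<version>/" prefix so the
            # two versions can be compared path-for-path.
            return {m.name.split("/", 1)[1]
                    for m in tf.getmembers() if m.isfile()}

    old, new = sdist_members("0.4.5"), sdist_members("0.4.5.post1")
    print("added:", sorted(new - old))
    print("removed:", sorted(old - new))

This only reports added and removed files; the per-file +N -M counts in the listing additionally come from diffing the contents of files present in both versions.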
Files changed (612)
  1. {sglang-0.4.5/sglang.egg-info → sglang-0.4.5.post1}/PKG-INFO +14 -4
  2. {sglang-0.4.5 → sglang-0.4.5.post1}/README.md +1 -1
  3. {sglang-0.4.5 → sglang-0.4.5.post1}/pyproject.toml +15 -3
  4. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_one_batch.py +21 -0
  5. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_serving.py +10 -4
  6. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/model_config.py +37 -5
  7. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/base_grammar_backend.py +26 -5
  8. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/llguidance_backend.py +1 -0
  9. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/outlines_backend.py +1 -0
  10. sglang-0.4.5.post1/sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  11. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/xgrammar_backend.py +1 -0
  12. sglang-0.4.5.post1/sglang/srt/disaggregation/base/__init__.py +8 -0
  13. sglang-0.4.5.post1/sglang/srt/disaggregation/base/conn.py +113 -0
  14. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/decode.py +18 -5
  15. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/mini_lb.py +53 -122
  16. sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  17. sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/conn.py +615 -0
  18. sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
  19. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/prefill.py +43 -19
  20. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/disaggregation/utils.py +31 -0
  21. sglang-0.4.5.post1/sglang/srt/entrypoints/EngineBase.py +53 -0
  22. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/entrypoints/engine.py +36 -8
  23. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/entrypoints/http_server.py +37 -8
  24. sglang-0.4.5.post1/sglang/srt/entrypoints/http_server_engine.py +142 -0
  25. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/entrypoints/verl_engine.py +37 -10
  26. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/hf_transformers_utils.py +4 -0
  27. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashattention_backend.py +330 -200
  28. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashinfer_backend.py +13 -7
  29. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/vision.py +1 -1
  30. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/dp_attention.py +2 -4
  31. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/elementwise.py +15 -2
  32. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/linear.py +1 -0
  33. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  34. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  36. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  38. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  39. sglang-0.4.5.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  40. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +38 -21
  41. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/router.py +7 -1
  42. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/topk.py +37 -16
  43. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/__init__.py +12 -5
  44. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
  45. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
  46. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8.py +25 -13
  47. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8_kernel.py +130 -4
  48. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/fp8_utils.py +34 -6
  49. sglang-0.4.5.post1/sglang/srt/layers/quantization/kv_cache.py +89 -0
  50. sglang-0.4.5.post1/sglang/srt/layers/quantization/modelopt_quant.py +463 -0
  51. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
  52. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/w8a8_int8.py +1 -0
  53. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/radix_attention.py +13 -1
  54. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/rotary_embedding.py +12 -1
  55. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/io_struct.py +254 -97
  56. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/mm_utils.py +3 -2
  57. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
  58. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  59. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  60. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/schedule_batch.py +62 -21
  61. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/scheduler.py +71 -14
  62. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/tokenizer_manager.py +17 -3
  63. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/tp_worker.py +1 -0
  64. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/memory_pool.py +14 -1
  65. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/metrics/collector.py +9 -0
  66. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  67. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_executor/forward_batch_info.py +234 -15
  68. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_executor/model_runner.py +48 -9
  69. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/loader.py +31 -4
  70. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/weight_utils.py +4 -2
  71. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/baichuan.py +2 -0
  72. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/chatglm.py +1 -0
  73. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/commandr.py +1 -0
  74. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/dbrx.py +1 -0
  75. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek.py +1 -0
  76. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_v2.py +248 -61
  77. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/exaone.py +1 -0
  78. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma.py +1 -0
  79. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma2.py +1 -0
  80. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma3_causal.py +1 -0
  81. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gpt2.py +1 -0
  82. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gpt_bigcode.py +1 -0
  83. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/granite.py +1 -0
  84. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/grok.py +1 -0
  85. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/internlm2.py +1 -0
  86. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama.py +1 -0
  87. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama4.py +101 -34
  88. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpm.py +1 -0
  89. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpm3.py +2 -0
  90. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mixtral.py +1 -0
  91. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mixtral_quant.py +1 -0
  92. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mllama.py +51 -8
  93. sglang-0.4.5.post1/sglang/srt/models/mllama4.py +227 -0
  94. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/olmo.py +1 -0
  95. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/olmo2.py +1 -0
  96. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/olmoe.py +1 -0
  97. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/phi3_small.py +1 -0
  98. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen.py +1 -0
  99. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2.py +1 -0
  100. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_5_vl.py +35 -70
  101. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_moe.py +1 -0
  102. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_vl.py +27 -25
  103. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/stablelm.py +1 -0
  104. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/xverse.py +1 -0
  105. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/xverse_moe.py +1 -0
  106. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/openai_api/adapter.py +4 -1
  107. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/patch_torch.py +11 -0
  108. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/server_args.py +34 -0
  109. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  110. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_utils.py +1 -11
  111. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/eagle_worker.py +6 -2
  112. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/utils.py +120 -9
  113. sglang-0.4.5.post1/sglang/test/attention/test_flashattn_backend.py +350 -0
  114. sglang-0.4.5.post1/sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  115. sglang-0.4.5.post1/sglang/test/attention/test_prefix_chunk_info.py +224 -0
  116. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_block_fp8.py +57 -0
  117. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_utils.py +19 -8
  118. sglang-0.4.5.post1/sglang/version.py +1 -0
  119. {sglang-0.4.5 → sglang-0.4.5.post1/sglang.egg-info}/PKG-INFO +14 -4
  120. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/SOURCES.txt +17 -3
  121. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/requires.txt +13 -2
  122. sglang-0.4.5/sglang/srt/disaggregation/conn.py +0 -81
  123. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -146
  124. sglang-0.4.5/sglang/srt/layers/quantization/kv_cache.py +0 -98
  125. sglang-0.4.5/sglang/srt/layers/quantization/modelopt_quant.py +0 -196
  126. sglang-0.4.5/sglang/srt/models/mllama4.py +0 -154
  127. sglang-0.4.5/sglang/test/attention/test_flashattn_backend.py +0 -312
  128. sglang-0.4.5/sglang/version.py +0 -1
  129. {sglang-0.4.5 → sglang-0.4.5.post1}/LICENSE +0 -0
  130. {sglang-0.4.5 → sglang-0.4.5.post1}/setup.cfg +0 -0
  131. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/__init__.py +0 -0
  132. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/api.py +0 -0
  133. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_offline_throughput.py +0 -0
  134. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/bench_one_batch_server.py +0 -0
  135. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/check_env.py +0 -0
  136. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/global_config.py +0 -0
  137. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/__init__.py +0 -0
  138. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/__init__.py +0 -0
  139. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/anthropic.py +0 -0
  140. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/base_backend.py +0 -0
  141. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/litellm.py +0 -0
  142. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/openai.py +0 -0
  143. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  144. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/backend/vertexai.py +0 -0
  145. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/chat_template.py +0 -0
  146. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/choices.py +0 -0
  147. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/compiler.py +0 -0
  148. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/interpreter.py +0 -0
  149. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/ir.py +0 -0
  150. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/lang/tracer.py +0 -0
  151. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/launch_server.py +0 -0
  152. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/llama3_eval.py +0 -0
  153. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/_custom_ops.py +0 -0
  154. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/aio_rwlock.py +0 -0
  155. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/code_completion_parser.py +0 -0
  156. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/__init__.py +0 -0
  157. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/chatglm.py +0 -0
  158. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/dbrx.py +0 -0
  159. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/deepseekvl2.py +0 -0
  160. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/device_config.py +0 -0
  161. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/exaone.py +0 -0
  162. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/janus_pro.py +0 -0
  163. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/load_config.py +0 -0
  164. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/configs/utils.py +0 -0
  165. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/__init__.py +0 -0
  166. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/base_connector.py +0 -0
  167. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/redis.py +0 -0
  168. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/s3.py +0 -0
  169. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/serde/__init__.py +0 -0
  170. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
  171. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/serde/serde.py +0 -0
  172. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/connector/utils.py +0 -0
  173. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  174. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/conversation.py +0 -0
  175. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/custom_op.py +0 -0
  176. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/__init__.py +0 -0
  177. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/communication_op.py +0 -0
  178. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  179. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  180. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  181. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  182. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  183. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  184. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  185. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  186. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  187. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/distributed/utils.py +0 -0
  188. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/function_call_parser.py +0 -0
  189. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/activation.py +0 -0
  190. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  191. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  192. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  193. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  194. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  195. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  196. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  197. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  198. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  199. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  200. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  201. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/attention/utils.py +0 -0
  202. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/layernorm.py +0 -0
  203. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/logits_processor.py +0 -0
  204. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  205. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  206. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  207. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  208. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  209. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  210. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  211. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  212. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  213. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  214. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  215. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  216. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  217. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  218. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  219. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  220. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  221. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  222. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  223. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  224. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  225. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  226. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  227. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  228. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  229. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  230. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  231. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  232. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  233. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  234. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  235. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  236. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  237. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  238. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  239. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  240. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  241. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  242. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  243. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  244. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  245. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  246. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  247. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  249. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  250. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  251. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  252. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  253. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  254. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  255. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  257. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  258. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  259. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  260. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  261. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  262. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  263. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  264. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  265. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  266. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  267. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  268. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  269. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  270. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  271. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  272. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  273. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  274. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  275. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  276. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  277. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  278. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  279. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  280. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  281. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  282. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  283. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  284. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  285. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  286. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  287. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  288. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  289. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  290. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  291. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  292. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  293. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  294. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  295. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  296. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  297. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  298. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  299. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  300. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  301. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  302. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  303. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  304. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  305. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  306. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  307. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  308. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  309. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  310. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  311. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  312. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  313. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  314. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  315. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  316. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  317. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  318. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  319. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  320. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  321. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  322. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  323. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  324. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  325. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  326. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  327. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  328. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  329. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  330. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  331. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  332. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  333. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  334. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  335. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  336. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  337. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
  338. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/parameter.py +0 -0
  339. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/pooler.py +0 -0
  340. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/awq.py +0 -0
  341. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  342. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  343. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  344. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  345. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  346. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  347. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  348. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
  501. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  502. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  503. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
  504. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/quantization/utils.py +0 -0
  505. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/sampler.py +0 -0
  506. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  507. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  508. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/__init__.py +0 -0
  509. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
  510. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  511. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
  512. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/layers.py +0 -0
  513. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/lora.py +0 -0
  514. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/lora_config.py +0 -0
  515. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/lora_manager.py +0 -0
  516. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/mem_pool.py +0 -0
  517. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  518. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  519. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  520. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  521. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  522. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/lora/utils.py +0 -0
  523. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/cache_controller.py +0 -0
  524. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/configure_logging.py +0 -0
  525. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  526. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  527. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/expert_distribution.py +0 -0
  528. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
  529. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  530. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
  531. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
  532. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  533. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
  534. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  535. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
  536. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  537. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  538. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/session_controller.py +0 -0
  539. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  540. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/managers/utils.py +0 -0
  541. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  542. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  543. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  544. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  545. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  546. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  547. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/metrics/func_timer.py +0 -0
  548. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/mm_utils.py +0 -0
  549. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/__init__.py +0 -0
  550. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_loader/utils.py +0 -0
  551. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/model_parallel.py +0 -0
  552. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/clip.py +0 -0
  553. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  554. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_nextn.py +0 -0
  555. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/deepseek_vl2.py +0 -0
  556. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  557. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/gemma3_mm.py +0 -0
  558. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  559. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_classification.py +0 -0
  560. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_eagle.py +0 -0
  561. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_eagle3.py +0 -0
  562. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_embedding.py +0 -0
  563. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llama_reward.py +0 -0
  564. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llava.py +0 -0
  565. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/llavavid.py +0 -0
  566. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpmo.py +0 -0
  567. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/minicpmv.py +0 -0
  568. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/mistral.py +0 -0
  569. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_classification.py +0 -0
  570. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  571. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/qwen2_rm.py +0 -0
  572. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/registry.py +0 -0
  573. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  574. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/models/yivl.py +0 -0
  575. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/openai_api/protocol.py +0 -0
  576. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/platforms/interface.py +0 -0
  577. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/reasoning_parser.py +0 -0
  578. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  579. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  580. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  581. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  582. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  583. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  584. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  585. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  586. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/server.py +0 -0
  587. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  588. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/speculative/spec_info.py +0 -0
  589. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  590. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/srt/warmup.py +0 -0
  591. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/__init__.py +0 -0
  592. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/attention/__init__.py +0 -0
  593. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  594. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  595. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/run_eval.py +0 -0
  596. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/runners.py +0 -0
  597. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/send_one.py +0 -0
  598. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_common.py +0 -0
  599. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  600. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  601. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_math.py +0 -0
  602. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  603. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  604. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_activation.py +0 -0
  605. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_block_fp8_ep.py +0 -0
  606. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_custom_ops.py +0 -0
  607. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
  608. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_layernorm.py +0 -0
  609. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/test/test_programs.py +0 -0
  610. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang/utils.py +0 -0
  611. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/dependency_links.txt +0 -0
  612. {sglang-0.4.5 → sglang-0.4.5.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.5
+ Version: 0.4.5.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -239,20 +239,30 @@ Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
- Requires-Dist: transformers==4.51.0; extra == "runtime-common"
+ Requires-Dist: transformers==4.51.1; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: compressed-tensors; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.0.8; extra == "srt"
+ Requires-Dist: sgl-kernel==0.0.9.post1; extra == "srt"
  Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
  Requires-Dist: torch==2.5.1; extra == "srt"
+ Requires-Dist: torchvision==0.20.1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
  Requires-Dist: partial_json_parser; extra == "srt"
  Requires-Dist: einops; extra == "srt"
+ Provides-Extra: blackwell
+ Requires-Dist: sglang[runtime_common]; extra == "blackwell"
+ Requires-Dist: sgl-kernel; extra == "blackwell"
+ Requires-Dist: torch; extra == "blackwell"
+ Requires-Dist: torchvision; extra == "blackwell"
+ Requires-Dist: cuda-python; extra == "blackwell"
+ Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
+ Requires-Dist: partial_json_parser; extra == "blackwell"
+ Requires-Dist: einops; extra == "blackwell"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -391,7 +401,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

  ## Adoption and Sponsorship
  The project has been deployed to large-scale production, generating trillions of tokens every day.
- It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

@@ -63,7 +63,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

  ## Adoption and Sponsorship
  The project has been deployed to large-scale production, generating trillions of tokens every day.
- It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.4.5"
+ version = "0.4.5.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -38,7 +38,7 @@ runtime_common = [
      "pyzmq>=25.1.2",
      "soundfile==0.13.1",
      "torchao>=0.7.0",
-     "transformers==4.51.0",
+     "transformers==4.51.1",
      "uvicorn",
      "uvloop",
      "compressed-tensors",
@@ -47,9 +47,21 @@ runtime_common = [

  srt = [
      "sglang[runtime_common]",
-     "sgl-kernel==0.0.8",
+     "sgl-kernel==0.0.9.post1",
      "flashinfer_python==0.2.3",
      "torch==2.5.1",
+     "torchvision==0.20.1",
+     "cuda-python",
+     "outlines>=0.0.44,<=0.1.11",
+     "partial_json_parser",
+     "einops",
+ ]
+
+ blackwell = [
+     "sglang[runtime_common]",
+     "sgl-kernel",
+     "torch",
+     "torchvision",
      "cuda-python",
      "outlines>=0.0.44,<=0.1.11",
      "partial_json_parser",
@@ -60,6 +60,7 @@ from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.entrypoints.engine import _set_envs_and_config
  from sglang.srt.hf_transformers_utils import get_tokenizer
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+ from sglang.srt.managers.scheduler import Scheduler
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling.sampling_params import SamplingParams
@@ -135,6 +136,7 @@ def load_model(server_args, port_args, tp_rank):
          context_length=server_args.context_length,
          model_override_args=server_args.json_model_override_args,
          is_embedding=server_args.is_embedding,
+         enable_multimodal=server_args.enable_multimodal,
          dtype=server_args.dtype,
          quantization=server_args.quantization,
      )
@@ -184,6 +186,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
          req.prefix_indices = []
          req.fill_ids = req.origin_input_ids
          req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+         req.logprob_start_len = len(req.origin_input_ids) - 1
          reqs.append(req)

      return input_ids, reqs
@@ -199,6 +202,7 @@ def prepare_extend_inputs_for_correctness_test(
              i, : bench_args.cut_len
          ]
          req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+         req.logprob_start_len = len(req.origin_input_ids) - 1
      return reqs


@@ -220,6 +224,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
          req.prefix_indices = []
          req.fill_ids = req.origin_input_ids
          req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+         req.logprob_start_len = len(req.origin_input_ids) - 1
          reqs.append(req)

      return reqs
@@ -238,6 +243,7 @@ def extend(reqs, model_runner):
          enable_custom_logit_processor=False,
      )
      batch.prepare_for_extend()
+     _maybe_prepare_dp_attn_batch(batch, model_runner)
      model_worker_batch = batch.get_model_worker_batch()
      forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
      logits_output = model_runner.forward(forward_batch)
@@ -249,6 +255,7 @@ def decode(input_token_ids, batch, model_runner):
      batch.output_ids = input_token_ids
      batch.prepare_for_decode()
+     _maybe_prepare_dp_attn_batch(batch, model_runner)
      model_worker_batch = batch.get_model_worker_batch()
      forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
      logits_output = model_runner.forward(forward_batch)
@@ -256,6 +263,20 @@ def decode(input_token_ids, batch, model_runner):
      return next_token_ids, logits_output.next_token_logits


+ def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
+     if model_runner.server_args.enable_dp_attention:
+         Scheduler.prepare_dp_attn_batch_raw(
+             batch,
+             dp_size=model_runner.server_args.dp_size,
+             attn_tp_size=1,
+             tp_cpu_group=model_runner.tp_group.cpu_group,
+             get_idle_batch=None,
+             disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
+             spec_algorithm=SpeculativeAlgorithm.NONE,
+             speculative_num_draft_tokens=None,
+         )
+
+
  def correctness_test(
      server_args,
      port_args,
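
Note on the hunk above: the new _maybe_prepare_dp_attn_batch hook reuses the scheduler's own data-parallel attention preparation (Scheduler.prepare_dp_attn_batch_raw) in both the extend and decode paths, so this one-batch benchmark builds the same gathered global batch the server would; it is a no-op unless enable_dp_attention is set in the server args.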
@@ -490,7 +490,7 @@ def get_dataset(args, tokenizer):
              prompt_suffix=args.prompt_suffix,
              apply_chat_template=args.apply_chat_template,
          )
-     elif args.dataset_name == "random":
+     elif args.dataset_name.startswith("random"):
          input_requests = sample_random_requests(
              input_len=args.random_input_len,
              output_len=args.random_output_len,
@@ -498,6 +498,7 @@ def get_dataset(args, tokenizer):
              range_ratio=args.random_range_ratio,
              tokenizer=tokenizer,
              dataset_path=args.dataset_path,
+             random_sample=args.dataset_name == "random",
          )
      elif args.dataset_name == "generated-shared-prefix":
          input_requests = sample_generated_shared_prefix_requests(
@@ -687,6 +688,7 @@ def sample_random_requests(
      range_ratio: float,
      tokenizer: PreTrainedTokenizerBase,
      dataset_path: str,
+     random_sample: bool = True,
  ) -> List[Tuple[str, int, int]]:

      input_lens = np.random.randint(
@@ -700,11 +702,15 @@ def sample_random_requests(
          size=num_prompts,
      )

-     if True:
+     if random_sample:
          # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens

          # Download sharegpt if necessary
          if not os.path.isfile(dataset_path):
+             print(
+                 "If you do not want to randomly sample from a dataset,"
+                 " please use --dataset-name random-ids."
+             )
              dataset_path = download_and_cache_file(SHAREGPT_URL)

          # Load the dataset.
@@ -1223,7 +1229,7 @@ async def benchmark(
          output_file_name = args.output_file
      else:
          now = datetime.now().strftime("%m%d")
-         if args.dataset_name == "random":
+         if args.dataset_name.startswith("random"):
              output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
          else:
              output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1442,7 +1448,7 @@ if __name__ == "__main__":
          "--dataset-name",
          type=str,
          default="sharegpt",
-         choices=["sharegpt", "random", "generated-shared-prefix"],
+         choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
          help="Name of the dataset to benchmark on.",
      )
      parser.add_argument(
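
Net effect of the serving-benchmark changes above: --dataset-name random keeps sampling token ids from ShareGPT text (the behavior previously hard-wired behind "if True:"), while the new random-ids choice skips the dataset and uses purely random token ids. An example invocation, using the flags defined in this file's argparse:

    python3 -m sglang.bench_serving --backend sglang --dataset-name random-ids \
        --num-prompts 100 --random-input-len 1024 --random-output-len 128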
@@ -15,6 +15,7 @@
  import json
  import logging
  import math
+ import os
  from enum import IntEnum, auto
  from typing import List, Optional, Set, Union

@@ -42,10 +43,12 @@ class ModelConfig:
          context_length: Optional[int] = None,
          model_override_args: Optional[str] = None,
          is_embedding: Optional[bool] = None,
+         enable_multimodal: Optional[bool] = None,
          dtype: str = "auto",
          quantization: Optional[str] = None,
          override_config_file: Optional[str] = None,
      ) -> None:
+
          self.model_path = model_path
          self.revision = revision
          self.quantization = quantization
@@ -69,14 +72,28 @@ class ModelConfig:
              self.hf_text_config, "attention_chunk_size", None
          )

+         if enable_multimodal is None:
+             if self.hf_config.architectures == "Llama4ForConditionalGeneration":
+                 enable_multimodal = False
+             else:
+                 enable_multimodal = True
+
          # Check model type
          self.is_generation = is_generation_model(
              self.hf_config.architectures, is_embedding
          )
-         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
-         self.is_multimodal_gen = is_multimodal_gen_model(self.hf_config.architectures)
-         self.is_image_gen = is_image_gen_model(self.hf_config.architectures)
-         self.is_audio_model = is_audio_model(self.hf_config.architectures)
+         self.is_multimodal = enable_multimodal and is_multimodal_model(
+             self.hf_config.architectures
+         )
+         self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
+             self.hf_config.architectures
+         )
+         self.is_image_gen = enable_multimodal and is_image_gen_model(
+             self.hf_config.architectures
+         )
+         self.is_audio_model = enable_multimodal and is_audio_model(
+             self.hf_config.architectures
+         )
          self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
          self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

@@ -234,6 +251,20 @@ class ModelConfig:
          if quant_cfg is None:
              # compressed-tensors uses a "compression_config" key
              quant_cfg = getattr(self.hf_config, "compression_config", None)
+         if quant_cfg is None:
+             # check if is modelopt model -- modelopt doesn't have corresponding field
+             # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
+             # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
+             is_local = os.path.exists(self.model_path)
+             modelopt_quant_config = {"quant_method": "modelopt"}
+             if not is_local:
+                 from huggingface_hub import HfApi
+
+                 hf_api = HfApi()
+                 if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
+                     quant_cfg = modelopt_quant_config
+             elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
+                 quant_cfg = modelopt_quant_config
          return quant_cfg

      # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
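
Detection order in this quantization-config lookup is therefore: quantization_config in the HF config, then compression_config (compressed-tensors), and finally the standalone hf_quant_config.json, probed on disk for local paths or via the HuggingFace Hub API otherwise, which tags the checkpoint with quant_method "modelopt".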
@@ -264,6 +295,7 @@ class ModelConfig:
          "moe_wna16",
      ]
      compatible_quantization_methods = {
+         "modelopt_fp4": ["modelopt"],
          "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
          "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
      }
@@ -470,8 +502,8 @@ multimodal_model_archs = [
      "Gemma3ForConditionalGeneration",
      "Grok1VForCausalLM",
      "Grok1AForCausalLM",
-     # TODO: add multimodal support for "Llama4ForConditionalGeneration",
      "LlavaLlamaForCausalLM",
+     "Llama4ForConditionalGeneration",
      "LlavaMistralForCausalLM",
      "LlavaQwenForCausalLM",
      "LlavaVidForCausalLM",
@@ -28,6 +28,18 @@ logger = logging.getLogger(__name__)


  class BaseGrammarObject(ABC):
+
+     def __init__(self):
+         self._finished = False
+
+     @property
+     def finished(self):
+         return self._finished
+
+     @finished.setter
+     def finished(self, finished):
+         self._finished = finished
+
      @abstractmethod
      def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
          """
@@ -59,6 +71,13 @@ class BaseGrammarObject(ABC):
          """
          raise NotImplementedError

+     @abstractmethod
+     def accept_token(self, token: int) -> None:
+         """
+         Accept a token in the grammar.
+         """
+         raise NotImplementedError
+
      @abstractmethod
      def allocate_vocab_mask(
          self, vocab_size: int, batch_size: int, device
@@ -90,7 +109,7 @@ class CacheEntry:
      event: Event


- class BaseGrammarBackend(ABC):
+ class BaseGrammarBackend:
      def __init__(self):
          self.executor = ThreadPoolExecutor()
          self.cache: Dict[Tuple[str, str], CacheEntry] = {}
@@ -107,19 +126,15 @@ class BaseGrammarBackend(ABC):
          """
          raise ValueError(f"Invalid key_type: {key_type}={key_string}")

-     @abstractmethod
      def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
          return self._not_supported("json", key_string)

-     @abstractmethod
      def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
          return self._not_supported("regex", key_string)

-     @abstractmethod
      def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
          return self._not_supported("ebnf", key_string)

-     @abstractmethod
      def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
          return self._not_supported("structural_tag", key_string)

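Dropping ABC from BaseGrammarBackend and removing the @abstractmethod decorators turns the dispatch_* hooks into optional overrides: a backend that lacks support for, say, structural tags now simply inherits the _not_supported fallback instead of being forced to stub the method out.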
@@ -195,4 +210,10 @@ def create_grammar_backend(
      else:
          raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")

+     if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
+         from .reasoner_grammar_backend import ReasonerGrammarBackend
+
+         grammar_backend = ReasonerGrammarBackend(
+             grammar_backend, tokenizer.think_end_id
+         )
      return grammar_backend
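
When a reasoning parser is configured and the tokenizer exposes a think_end_id, the chosen backend is transparently wrapped so that grammar constraints are held back during the model's reasoning phase and only engage after the end-of-thinking token; the wrapper itself is the new reasoner_grammar_backend module added below.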
@@ -33,6 +33,7 @@ class GuidanceGrammar(BaseGrammarObject):
      def __init__(
          self, llguidance_tokenizer: llguidance.LLTokenizer, serialized_grammar: str
      ):
+         super().__init__()
          self.llguidance_tokenizer = llguidance_tokenizer
          self.serialized_grammar = serialized_grammar
@@ -44,6 +44,7 @@ class OutlinesGrammar(BaseGrammarObject):
          guide: RegexGuide,
          jump_forward_map: Union[OutlinesJumpForwardMap, None],
      ) -> None:
+         super().__init__()
          self.guide = guide
          self.jump_forward_map = jump_forward_map
          self.state = 0
@@ -0,0 +1,101 @@
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ """The baseclass of a backend for reasoner grammar-guided constrained decoding."""
+
+ from concurrent.futures import Future
+ from typing import List, Optional, Tuple
+
+ import torch
+
+ from .base_grammar_backend import BaseGrammarBackend, BaseGrammarObject
+
+
+ class ReasonerGrammarObject(BaseGrammarObject):
+     def __init__(self, grammar: BaseGrammarObject, think_end_id):
+         super().__init__()
+         self.grammar = grammar
+         self.think_end_id = think_end_id
+         self.is_in_reasoning = True
+
+     @property
+     def finished(self):
+         return self.grammar.finished
+
+     @finished.setter
+     def finished(self, finished):
+         self.grammar.finished = finished
+
+     def allocate_vocab_mask(
+         self, vocab_size: int, batch_size: int, device
+     ) -> torch.Tensor:
+         return self.grammar.allocate_vocab_mask(vocab_size, batch_size, device)
+
+     def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+         if not self.is_in_reasoning:
+             self.grammar.fill_vocab_mask(vocab_mask, idx)
+
+     def move_vocab_mask(self, vocab_mask: torch.Tensor, device) -> torch.Tensor:
+         return self.grammar.move_vocab_mask(vocab_mask, device)
+
+     @property
+     def apply_vocab_mask(self):
+         return self.grammar.apply_vocab_mask
+
+     def accept_token(self, token: int):
+         if token == self.think_end_id:
+             self.is_in_reasoning = False
+
+         if not self.is_in_reasoning and token != self.think_end_id:
+             self.grammar.accept_token(token)
+
+     def try_jump_forward(self, tokenizer):
+         return self.grammar.try_jump_forward(tokenizer)
+
+     def jump_forward_str_state(self, helper):
+         return self.grammar.jump_forward_str_state(helper)
+
+     def jump_and_retokenize(
+         self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+     ):
+         return self.grammar.jump_and_retokenize(
+             old_output_ids, new_output_ids, next_state
+         )
+
+     def copy(self) -> BaseGrammarObject:
+         return ReasonerGrammarObject(self.grammar.copy(), self.think_end_id)
+
+
+ class ReasonerGrammarBackend(BaseGrammarBackend):
+     def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
+         self.grammar_backend = grammar_backend
+         self.think_end_id = think_end_id
+
+     def get_cached_value(self, key: Tuple[str, str]) -> Optional[ReasonerGrammarObject]:
+         grammar = self.grammar_backend.get_cached_value(key)
+         return ReasonerGrammarObject(grammar, self.think_end_id) if grammar else None
+
+     def get_future_value(self, key: Tuple[str, str]) -> Future:
+         grammar = Future()
+
+         def callback(f: Future):
+             if result := f.result():
+                 grammar.set_result(ReasonerGrammarObject(result, self.think_end_id))
+             else:
+                 grammar.set_result(None)
+
+         self.grammar_backend.get_future_value(key).add_done_callback(callback)
+         return grammar
+
+     def reset(self):
+         self.grammar_backend.reset()
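
A minimal sketch of how the wrapper composes with a concrete backend, mirroring the wiring in create_grammar_backend above (assumes the package path of the surrounding constrained backends, and that think_end_id is present only on reasoning-capable tokenizers):

    from sglang.srt.constrained.reasoner_grammar_backend import ReasonerGrammarBackend

    def maybe_wrap_with_reasoner(grammar_backend, tokenizer):
        # Tokens flow unconstrained while the model is "thinking"; the wrapped
        # grammar starts masking the vocabulary only after think_end_id is seen.
        if hasattr(tokenizer, "think_end_id"):
            return ReasonerGrammarBackend(grammar_backend, tokenizer.think_end_id)
        return grammar_backend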
@@ -48,6 +48,7 @@ class XGrammarGrammar(BaseGrammarObject):
          ctx: CompiledGrammar,
          override_stop_tokens: Optional[Union[List[int], int]],
      ) -> None:
+         super().__init__()
          self.matcher = matcher
          self.vocab_size = vocab_size
          self.ctx = ctx
@@ -0,0 +1,8 @@
+ from .conn import (
+     BaseKVBootstrapServer,
+     BaseKVManager,
+     BaseKVReceiver,
+     BaseKVSender,
+     KVArgs,
+     KVPoll,
+ )
@@ -0,0 +1,113 @@
+ from abc import ABC, abstractmethod
+ from typing import Optional
+
+ import numpy as np
+ import numpy.typing as npt
+
+ from sglang.srt.disaggregation.utils import DisaggregationMode
+ from sglang.srt.server_args import ServerArgs
+
+
+ class KVArgs:
+     engine_rank: int
+     kv_data_ptrs: list[int]
+     kv_data_lens: list[int]
+     kv_item_lens: list[int]
+     aux_data_ptrs: list[int]
+     aux_data_lens: list[int]
+     aux_item_lens: list[int]
+     ib_device: str
+     gpu_id: int
+
+
+ class KVPoll:
+     Failed = 0
+     Bootstrapping = 1
+     WaitingForInput = 2
+     Transferring = 3
+     Success = 4
+
+
+ class BaseKVManager(ABC):
+     """Base class for managing transfers states"""
+
+     @abstractmethod
+     def __init__(
+         self,
+         args: KVArgs,
+         disaggregation_mode: DisaggregationMode,
+         server_args: ServerArgs,
+     ): ...
+
+
+ class BaseKVSender(ABC):
+
+     @abstractmethod
+     def __init__(
+         self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int
+     ): ...
+
+     @abstractmethod
+     def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
+         """
+         Notify the decoder server about the kv indices length and aux index
+         """
+         ...
+
+     @abstractmethod
+     def send(self, kv_indices: npt.NDArray[np.int64]):
+         """
+         Send the kv cache at the given kv indices to the decoder server
+         """
+         ...
+
+     @abstractmethod
+     def poll(self) -> KVPoll:
+         """
+         Check the status of the kv cache transfer
+         """
+         ...
+
+     @abstractmethod
+     def failure_exception(self):
+         """
+         Raise an exception if the kv cache transfer fails
+         """
+         ...
+
+
+ class BaseKVReceiver(ABC):
+
+     @abstractmethod
+     def __init__(
+         self,
+         mgr: BaseKVManager,
+         bootstrap_addr: str,
+         bootstrap_room: Optional[int] = None,
+     ): ...
+
+     @abstractmethod
+     def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
+         """
+         Notify the prefill server about the kv indices and aux index
+         """
+         ...
+
+     @abstractmethod
+     def poll(self) -> KVPoll:
+         """
+         Check the status of the kv cache transfer
+         """
+         ...
+
+     @abstractmethod
+     def failure_exception(self):
+         """
+         Raise an exception if the kv cache transfer fails
+         """
+         ...
+
+
+ class BaseKVBootstrapServer(ABC):
+     @abstractmethod
+     def __init__(self, port: int): ...
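
A minimal sketch of what a concrete transport must provide under the interface above. The class below is hypothetical and completes transfers immediately; a real backend (such as the mooncake implementation added in this release) would move the referenced KV pages over the network and report progress through poll():

    import numpy as np
    import numpy.typing as npt

    from sglang.srt.disaggregation.base import BaseKVManager, BaseKVSender, KVPoll


    class LoopbackKVSender(BaseKVSender):
        """Hypothetical in-process sender illustrating the contract:
        init() announces sizes, send() ships the pages, poll() reports state."""

        def __init__(self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int):
            self.state = KVPoll.WaitingForInput

        def init(self, num_kv_indices: int, aux_index=None):
            # Announce the transfer size to the decode side (none exists here).
            self.num_kv_indices = num_kv_indices

        def send(self, kv_indices: npt.NDArray[np.int64]):
            # A real backend would write the KV pages referenced by kv_indices
            # to the decode server, passing through Transferring before Success.
            self.state = KVPoll.Success

        def poll(self) -> KVPoll:
            return self.state

        def failure_exception(self):
            raise RuntimeError("kv transfer failed")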