sglang 0.4.9__tar.gz → 0.4.9.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (751)
  1. {sglang-0.4.9/sglang.egg-info → sglang-0.4.9.post2}/PKG-INFO +6 -4
  2. {sglang-0.4.9 → sglang-0.4.9.post2}/README.md +1 -1
  3. {sglang-0.4.9 → sglang-0.4.9.post2}/pyproject.toml +5 -3
  4. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/bench_serving.py +2 -2
  5. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/model_config.py +36 -2
  6. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/conversation.py +56 -3
  7. sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/__init__.py +6 -0
  8. sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/conn.py +44 -0
  9. sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  10. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/conn.py +50 -18
  11. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  12. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/utils.py +25 -3
  13. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/engine.py +1 -1
  14. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/http_server.py +1 -0
  15. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/http_server_engine.py +1 -1
  16. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/protocol.py +11 -0
  17. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  18. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang-0.4.9.post2/sglang/srt/function_call/kimik2_detector.py +220 -0
  20. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/hf_transformers_utils.py +18 -0
  21. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/jinja_template_utils.py +8 -0
  22. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/communicator.py +20 -5
  23. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
  24. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/layernorm.py +2 -2
  25. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/linear.py +12 -2
  26. sglang-0.4.9.post2/sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  27. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
  28. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/layer.py +141 -2
  29. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
  30. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
  31. sglang-0.4.9.post2/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  32. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/topk.py +8 -2
  33. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/parameter.py +19 -3
  34. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/__init__.py +2 -0
  35. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/fp8.py +28 -7
  36. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  37. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  38. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/moe_wna16.py +1 -2
  39. sglang-0.4.9.post2/sglang/srt/layers/quantization/w4afp8.py +264 -0
  40. sglang-0.4.9.post2/sglang/srt/layers/quantization/w8a8_int8.py +1047 -0
  41. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/vocab_parallel_embedding.py +9 -3
  42. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  43. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  44. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  45. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  46. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/cache_controller.py +41 -195
  47. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/io_struct.py +35 -3
  48. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/mm_utils.py +59 -96
  49. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/schedule_batch.py +17 -6
  50. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/scheduler.py +38 -6
  51. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/tokenizer_manager.py +16 -0
  52. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/hiradix_cache.py +2 -0
  53. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/memory_pool.py +176 -101
  54. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/memory_pool_host.py +6 -109
  55. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/radix_cache.py +8 -4
  56. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_executor/forward_batch_info.py +13 -1
  57. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_loader/loader.py +23 -12
  58. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_janus_pro.py +1 -1
  59. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_v2.py +78 -19
  60. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_vl2.py +1 -1
  61. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma3_mm.py +1 -1
  62. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma3n_mm.py +6 -3
  63. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/internvl.py +8 -2
  64. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/kimi_vl.py +8 -2
  65. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llama.py +2 -0
  66. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llava.py +3 -1
  67. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llavavid.py +1 -1
  68. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/minicpmo.py +1 -2
  69. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/minicpmv.py +1 -1
  70. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/mixtral_quant.py +4 -0
  71. sglang-0.4.9.post2/sglang/srt/models/mllama4.py +549 -0
  72. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/phi4mm.py +8 -2
  73. sglang-0.4.9.post2/sglang/srt/models/phimoe.py +553 -0
  74. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2.py +2 -0
  75. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_5_vl.py +10 -7
  76. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_vl.py +12 -1
  77. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/vila.py +8 -2
  78. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/mm_utils.py +2 -2
  79. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/base_processor.py +197 -137
  80. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  81. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/gemma3.py +4 -2
  82. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/gemma3n.py +1 -1
  83. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/internvl.py +1 -1
  84. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/janus_pro.py +1 -1
  85. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  86. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/minicpm.py +4 -3
  87. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/mllama4.py +63 -61
  88. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/phi4mm.py +1 -1
  89. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/pixtral.py +1 -1
  90. sglang-0.4.9.post2/sglang/srt/multimodal/processors/qwen_vl.py +286 -0
  91. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/vila.py +1 -1
  92. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/server_args.py +26 -4
  93. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/two_batch_overlap.py +3 -0
  94. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/utils.py +191 -48
  95. sglang-0.4.9.post2/sglang/test/test_cutlass_w4a8_moe.py +281 -0
  96. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/utils.py +5 -5
  97. sglang-0.4.9.post2/sglang/version.py +1 -0
  98. {sglang-0.4.9 → sglang-0.4.9.post2/sglang.egg-info}/PKG-INFO +6 -4
  99. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang.egg-info/SOURCES.txt +9 -0
  100. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang.egg-info/requires.txt +4 -2
  101. sglang-0.4.9/sglang/srt/layers/quantization/w8a8_int8.py +0 -323
  102. sglang-0.4.9/sglang/srt/models/mllama4.py +0 -259
  103. sglang-0.4.9/sglang/srt/multimodal/processors/qwen_vl.py +0 -163
  104. sglang-0.4.9/sglang/version.py +0 -1
  105. {sglang-0.4.9 → sglang-0.4.9.post2}/LICENSE +0 -0
  106. {sglang-0.4.9 → sglang-0.4.9.post2}/setup.cfg +0 -0
  107. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/__init__.py +0 -0
  108. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/api.py +0 -0
  109. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/bench_offline_throughput.py +0 -0
  110. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/bench_one_batch.py +0 -0
  111. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/bench_one_batch_server.py +0 -0
  112. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/check_env.py +0 -0
  113. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/compile_deep_gemm.py +0 -0
  114. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/eval/llama3_eval.py +0 -0
  115. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/eval/loogle_eval.py +0 -0
  116. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/global_config.py +0 -0
  117. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/backend/__init__.py +0 -0
  118. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/backend/anthropic.py +0 -0
  119. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/backend/base_backend.py +0 -0
  120. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/backend/litellm.py +0 -0
  121. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/backend/openai.py +0 -0
  122. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  123. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/backend/vertexai.py +0 -0
  124. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/chat_template.py +0 -0
  125. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/choices.py +0 -0
  126. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/compiler.py +0 -0
  127. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/interpreter.py +0 -0
  128. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/ir.py +0 -0
  129. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/lang/tracer.py +0 -0
  130. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/launch_server.py +0 -0
  131. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/profiler.py +0 -0
  132. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/_custom_ops.py +0 -0
  133. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/aio_rwlock.py +0 -0
  134. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/code_completion_parser.py +0 -0
  135. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/__init__.py +0 -0
  136. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/chatglm.py +0 -0
  137. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/dbrx.py +0 -0
  138. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/deepseekvl2.py +0 -0
  139. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/device_config.py +0 -0
  140. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/exaone.py +0 -0
  141. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/internvl.py +0 -0
  142. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/janus_pro.py +0 -0
  143. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/kimi_vl.py +0 -0
  144. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
  145. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/load_config.py +0 -0
  146. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/update_config.py +0 -0
  147. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/utils.py +0 -0
  148. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/__init__.py +0 -0
  149. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/base_connector.py +0 -0
  150. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/redis.py +0 -0
  151. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/s3.py +0 -0
  152. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/serde/__init__.py +0 -0
  153. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/serde/safe_serde.py +0 -0
  154. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/serde/serde.py +0 -0
  155. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/connector/utils.py +0 -0
  156. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constants.py +0 -0
  157. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  158. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constrained/llguidance_backend.py +0 -0
  159. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constrained/outlines_backend.py +0 -0
  160. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  161. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  162. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  163. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  164. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/custom_op.py +0 -0
  165. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/debug_utils.py +0 -0
  166. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/base/__init__.py +0 -0
  167. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/base/conn.py +0 -0
  168. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/common/__init__.py +0 -0
  169. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/common/conn.py +0 -0
  170. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/common/utils.py +0 -0
  171. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/decode.py +0 -0
  172. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
  173. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/fake/__init__.py +0 -0
  174. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/fake/conn.py +0 -0
  175. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/kv_events.py +0 -0
  176. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/launch_lb.py +0 -0
  177. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mini_lb.py +0 -0
  178. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  179. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  180. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/nixl/conn.py +0 -0
  181. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/prefill.py +0 -0
  182. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/__init__.py +0 -0
  183. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/communication_op.py +0 -0
  184. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  185. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  186. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  187. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  188. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
  189. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
  190. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  191. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  192. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  193. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  194. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/parallel_state.py +0 -0
  195. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/distributed/utils.py +0 -0
  196. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/EngineBase.py +0 -0
  197. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/__init__.py +0 -0
  198. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
  199. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
  200. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
  201. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
  202. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
  203. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
  204. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/utils.py +0 -0
  205. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/__init__.py +0 -0
  206. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
  207. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
  208. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
  209. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_manager.py +0 -0
  210. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
  211. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
  212. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_distribution.py +0 -0
  213. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_location.py +0 -0
  214. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_location_dispatch.py +0 -0
  215. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_location_updater.py +0 -0
  216. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/base_format_detector.py +0 -0
  217. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/core_types.py +0 -0
  218. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
  219. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/ebnf_composer.py +0 -0
  220. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/llama32_detector.py +0 -0
  221. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/mistral_detector.py +0 -0
  222. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/pythonic_detector.py +0 -0
  223. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/qwen25_detector.py +0 -0
  224. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/function_call/utils.py +0 -0
  225. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/activation.py +0 -0
  226. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/amx_utils.py +0 -0
  227. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/aiter_backend.py +0 -0
  228. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/ascend_backend.py +0 -0
  229. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  230. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
  231. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  232. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashattention_backend.py +0 -0
  233. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  234. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  235. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  236. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
  237. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/merge_state.py +0 -0
  238. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/tbo_backend.py +0 -0
  239. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  240. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
  241. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  242. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  243. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  244. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
  245. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  246. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  247. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/utils.py +0 -0
  248. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/attention/vision.py +0 -0
  249. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/dp_attention.py +0 -0
  250. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/elementwise.py +0 -0
  251. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/logits_processor.py +0 -0
  252. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/cutlass_moe.py +0 -0
  253. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
  254. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  255. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
  256. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  257. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  258. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  259. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  260. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  261. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  262. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  263. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  264. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  265. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  266. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  267. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  268. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  269. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  270. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  271. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  272. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  273. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  274. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  275. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  276. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  277. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  278. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  279. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  280. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  281. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  282. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  283. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  284. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  285. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  286. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  287. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  288. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  289. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  290. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  291. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  292. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  293. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  294. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  295. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  296. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  297. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  298. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  299. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  300. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  301. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  302. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  303. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  304. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  305. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  306. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  307. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  308. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  309. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  310. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  311. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  312. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  313. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  314. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  315. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  316. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  317. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  318. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  319. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  320. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  321. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  322. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  323. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  324. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  325. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  326. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  327. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  328. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  329. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  330. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  331. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  332. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  333. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  334. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  335. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  336. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  337. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  338. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  339. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  340. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  341. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  342. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  343. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  344. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  345. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  346. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  347. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  348. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  349. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  350. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  351. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  352. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  353. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  354. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  355. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  356. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  357. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  358. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  359. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  360. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  361. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  362. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  363. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  364. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  365. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  366. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  367. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  368. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  369. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  370. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  371. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  372. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  373. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  374. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  375. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  376. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  377. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  378. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  379. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  380. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  381. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  382. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  383. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  384. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  385. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  386. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  387. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
  388. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
  389. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  390. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
  392. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
  394. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  395. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  396. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  397. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
  399. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
  401. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
  402. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  408. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  412. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  415. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  416. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  417. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  420. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/moe/router.py +0 -0
  421. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/multimodal.py +0 -0
  422. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/pooler.py +0 -0
  423. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/awq.py +0 -0
  424. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  425. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  426. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  427. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  428. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  429. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  430. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  431. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
  432. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  433. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  434. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  501. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  502. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  503. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  504. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  505. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  506. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  507. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  508. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  509. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  510. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  511. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  512. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  513. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  514. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  515. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  516. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  517. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  518. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  519. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  520. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  521. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  522. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  523. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  524. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  525. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  526. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  527. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  528. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  529. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  530. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  531. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  532. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  533. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  534. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  535. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  536. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  537. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  538. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  539. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  540. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  541. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  542. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  543. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  544. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  545. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  546. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  547. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  548. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  549. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  550. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  551. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  552. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  553. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  554. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  555. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  556. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  557. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  558. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  559. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  560. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  561. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  562. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  563. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  564. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  565. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  566. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  567. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  568. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  569. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  570. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  571. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  572. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  573. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  574. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  575. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  576. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  577. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  578. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  579. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  580. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  581. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  582. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  583. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  584. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  585. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  586. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
  587. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +0 -0
  588. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -0
  589. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +0 -0
  590. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  591. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/gptq.py +0 -0
  592. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  593. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  594. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  595. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/qoq.py +0 -0
  596. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/quant_utils.py +0 -0
  597. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/utils.py +0 -0
  598. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  599. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/radix_attention.py +0 -0
  600. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
  601. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/sampler.py +0 -0
  602. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  603. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/layers/utils.py +0 -0
  604. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/backend/base_backend.py +0 -0
  605. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  606. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/backend/triton_backend.py +0 -0
  607. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/layers.py +0 -0
  608. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/lora.py +0 -0
  609. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/lora_config.py +0 -0
  610. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/lora_manager.py +0 -0
  611. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/mem_pool.py +0 -0
  612. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  613. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/lora/utils.py +0 -0
  614. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/configure_logging.py +0 -0
  615. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
  616. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
  617. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/multimodal_processor.py +0 -0
  618. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -0
  619. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  620. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  621. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/session_controller.py +0 -0
  622. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/template_manager.py +0 -0
  623. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/tp_worker.py +0 -0
  624. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  625. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/managers/utils.py +0 -0
  626. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/allocator.py +0 -0
  627. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  628. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  629. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  630. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
  631. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/metrics/collector.py +0 -0
  632. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/metrics/func_timer.py +0 -0
  633. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  634. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_executor/model_runner.py +0 -0
  635. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_loader/__init__.py +0 -0
  636. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_loader/utils.py +0 -0
  637. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
  638. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/model_parallel.py +0 -0
  639. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/baichuan.py +0 -0
  640. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/bert.py +0 -0
  641. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/chatglm.py +0 -0
  642. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/clip.py +0 -0
  643. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/commandr.py +0 -0
  644. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/dbrx.py +0 -0
  645. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/deepseek.py +0 -0
  646. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_nextn.py +0 -0
  647. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/exaone.py +0 -0
  648. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma.py +0 -0
  649. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma2.py +0 -0
  650. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma2_reward.py +0 -0
  651. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma3_causal.py +0 -0
  652. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma3n_audio.py +0 -0
  653. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gemma3n_causal.py +0 -0
  654. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/glm4.py +0 -0
  655. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gpt2.py +0 -0
  656. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  657. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/granite.py +0 -0
  658. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/grok.py +0 -0
  659. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/hunyuan.py +0 -0
  660. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/idefics2.py +0 -0
  661. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/internlm2.py +0 -0
  662. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/internlm2_reward.py +0 -0
  663. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
  664. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llama4.py +0 -0
  665. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llama_classification.py +0 -0
  666. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llama_eagle.py +0 -0
  667. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llama_eagle3.py +0 -0
  668. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llama_embedding.py +0 -0
  669. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/llama_reward.py +0 -0
  670. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/mimo.py +0 -0
  671. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/mimo_mtp.py +0 -0
  672. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/minicpm.py +0 -0
  673. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/minicpm3.py +0 -0
  674. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/mistral.py +0 -0
  675. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/mixtral.py +0 -0
  676. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/mllama.py +0 -0
  677. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/olmo.py +0 -0
  678. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/olmo2.py +0 -0
  679. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/olmoe.py +0 -0
  680. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/phi3_small.py +0 -0
  681. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/pixtral.py +0 -0
  682. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen.py +0 -0
  683. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_audio.py +0 -0
  684. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_classification.py +0 -0
  685. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
  686. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_moe.py +0 -0
  687. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_rm.py +0 -0
  688. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen3.py +0 -0
  689. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/qwen3_moe.py +0 -0
  690. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/registry.py +0 -0
  691. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/roberta.py +0 -0
  692. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/siglip.py +0 -0
  693. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/stablelm.py +0 -0
  694. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  695. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/transformers.py +0 -0
  696. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/xverse.py +0 -0
  697. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/xverse_moe.py +0 -0
  698. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/models/yivl.py +0 -0
  699. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/clip.py +0 -0
  700. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/llava.py +0 -0
  701. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/mlama.py +0 -0
  702. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/operations.py +0 -0
  703. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/operations_strategy.py +0 -0
  704. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/patch_torch.py +0 -0
  705. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/reasoning_parser.py +0 -0
  706. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  707. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  708. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  709. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  710. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  711. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  712. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  713. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  714. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  715. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  716. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +0 -0
  717. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_utils.py +0 -0
  718. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_worker.py +0 -0
  719. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/speculative/spec_info.py +0 -0
  720. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  721. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/warmup.py +0 -0
  722. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/__init__.py +0 -0
  723. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/attention/__init__.py +0 -0
  724. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/attention/test_flashattn_backend.py +0 -0
  725. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  726. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  727. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  728. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  729. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/run_eval.py +0 -0
  730. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/runners.py +0 -0
  731. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/send_one.py +0 -0
  732. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/simple_eval_common.py +0 -0
  733. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  734. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  735. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/simple_eval_math.py +0 -0
  736. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  737. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  738. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_activation.py +0 -0
  739. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_block_fp8.py +0 -0
  740. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
  741. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_block_fp8_ep.py +0 -0
  742. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_custom_ops.py +0 -0
  743. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_cutlass_moe.py +0 -0
  744. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_deepep_utils.py +0 -0
  745. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_dynamic_grad_mode.py +0 -0
  746. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_fp4_moe.py +0 -0
  747. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_layernorm.py +0 -0
  748. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_programs.py +0 -0
  749. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang/test/test_utils.py +0 -0
  750. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang.egg-info/dependency_links.txt +0 -0
  751. {sglang-0.4.9 → sglang-0.4.9.post2}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.9
3
+ Version: 0.4.9.post2
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -239,8 +239,10 @@ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
239
239
  Requires-Dist: psutil; extra == "runtime-common"
240
240
  Requires-Dist: pydantic; extra == "runtime-common"
241
241
  Requires-Dist: pynvml; extra == "runtime-common"
242
+ Requires-Dist: pybase64; extra == "runtime-common"
242
243
  Requires-Dist: python-multipart; extra == "runtime-common"
243
244
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
245
+ Requires-Dist: sentencepiece; extra == "runtime-common"
244
246
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
245
247
  Requires-Dist: scipy; extra == "runtime-common"
246
248
  Requires-Dist: torchao==0.9.0; extra == "runtime-common"
@@ -248,10 +250,10 @@ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
248
250
  Requires-Dist: timm==1.0.16; extra == "runtime-common"
249
251
  Requires-Dist: uvicorn; extra == "runtime-common"
250
252
  Requires-Dist: uvloop; extra == "runtime-common"
251
- Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
253
+ Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
252
254
  Provides-Extra: srt
253
255
  Requires-Dist: sglang[runtime_common]; extra == "srt"
254
- Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
256
+ Requires-Dist: sgl-kernel==0.2.5; extra == "srt"
255
257
  Requires-Dist: torch==2.7.1; extra == "srt"
256
258
  Requires-Dist: torchaudio==2.7.1; extra == "srt"
257
259
  Requires-Dist: torchvision==0.22.1; extra == "srt"
@@ -419,7 +421,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
419
421
  [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
420
422
 
421
423
  ## Adoption and Sponsorship
422
- SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
424
+ SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
423
425
 
424
426
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
425
427
 
@@ -65,7 +65,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
65
65
  [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
66
66
 
67
67
  ## Adoption and Sponsorship
68
- SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
68
+ SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
69
69
 
70
70
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
71
71
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.9"
7
+ version = "0.4.9.post2"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -38,8 +38,10 @@ runtime_common = [
38
38
  "psutil",
39
39
  "pydantic",
40
40
  "pynvml",
41
+ "pybase64",
41
42
  "python-multipart",
42
43
  "pyzmq>=25.1.2",
44
+ "sentencepiece",
43
45
  "soundfile==0.13.1",
44
46
  "scipy",
45
47
  "torchao==0.9.0",
@@ -47,12 +49,12 @@ runtime_common = [
47
49
  "timm==1.0.16",
48
50
  "uvicorn",
49
51
  "uvloop",
50
- "xgrammar==0.1.19",
52
+ "xgrammar==0.1.21",
51
53
  ]
52
54
 
53
55
  srt = [
54
56
  "sglang[runtime_common]",
55
- "sgl-kernel==0.2.4",
57
+ "sgl-kernel==0.2.5",
56
58
  "torch==2.7.1",
57
59
  "torchaudio==2.7.1",
58
60
  "torchvision==0.22.1",
@@ -814,9 +814,9 @@ def sample_mmmu_requests(
814
814
  List of tuples (prompt, prompt_token_len, output_token_len).
815
815
  """
816
816
  try:
817
- import base64
818
817
  import io
819
818
 
819
+ import pybase64
820
820
  from datasets import load_dataset
821
821
  except ImportError:
822
822
  raise ImportError("Please install datasets: pip install datasets")
@@ -867,7 +867,7 @@ def sample_mmmu_requests(
867
867
  # Encode image to base64
868
868
  buffered = io.BytesIO()
869
869
  image.save(buffered, format="JPEG")
870
- img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
870
+ img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
871
871
  image_data = f"data:image/jpeg;base64,{img_str}"
872
872
  else:
873
873
  continue
@@ -25,6 +25,7 @@ from transformers import PretrainedConfig
25
25
  from sglang.srt.hf_transformers_utils import (
26
26
  get_config,
27
27
  get_context_length,
28
+ get_generation_config,
28
29
  get_hf_text_config,
29
30
  )
30
31
  from sglang.srt.layers.quantization import QUANTIZATION_METHODS
@@ -83,6 +84,13 @@ class ModelConfig:
83
84
  **kwargs,
84
85
  )
85
86
 
87
+ self.hf_generation_config = get_generation_config(
88
+ self.model_path,
89
+ trust_remote_code=trust_remote_code,
90
+ revision=revision,
91
+ **kwargs,
92
+ )
93
+
86
94
  self.hf_text_config = get_hf_text_config(self.hf_config)
87
95
  self.attention_chunk_size = getattr(
88
96
  self.hf_text_config, "attention_chunk_size", None
@@ -359,7 +367,17 @@ class ModelConfig:
359
367
  if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
360
368
  quant_cfg = modelopt_quant_config
361
369
  elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
362
- quant_cfg = modelopt_quant_config
370
+ quant_config_file = os.path.join(
371
+ self.model_path, "hf_quant_config.json"
372
+ )
373
+ with open(quant_config_file) as f:
374
+ quant_config_dict = json.load(f)
375
+ json_quant_configs = quant_config_dict["quantization"]
376
+ quant_algo = json_quant_configs.get("quant_algo", None)
377
+ if quant_algo == "MIXED_PRECISION":
378
+ quant_cfg = {"quant_method": "w4afp8"}
379
+ else:
380
+ quant_cfg = modelopt_quant_config
363
381
  return quant_cfg
364
382
 
365
383
  # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -389,6 +407,7 @@ class ModelConfig:
389
407
  "w8a8_fp8",
390
408
  "moe_wna16",
391
409
  "qoq",
410
+ "w4afp8",
392
411
  ]
393
412
  compatible_quantization_methods = {
394
413
  "modelopt_fp4": ["modelopt"],
@@ -402,7 +421,9 @@ class ModelConfig:
402
421
  quant_cfg = self._parse_quant_hf_config()
403
422
 
404
423
  if quant_cfg is not None:
405
- quant_method = quant_cfg.get("quant_method", "").lower()
424
+ quant_method = quant_cfg.get(
425
+ "quant_method", "" if not self.quantization else self.quantization
426
+ ).lower()
406
427
 
407
428
  # Detect which checkpoint it is
408
429
  for _, method in QUANTIZATION_METHODS.items():
@@ -454,6 +475,19 @@ class ModelConfig:
454
475
  if eos_ids:
455
476
  # it can be either int or list of int
456
477
  eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
478
+ if eos_ids is None:
479
+ eos_ids = set()
480
+ if self.hf_generation_config:
481
+ generation_eos_ids = getattr(
482
+ self.hf_generation_config, "eos_token_id", None
483
+ )
484
+ if generation_eos_ids:
485
+ generation_eos_ids = (
486
+ {generation_eos_ids}
487
+ if isinstance(generation_eos_ids, int)
488
+ else set(generation_eos_ids)
489
+ )
490
+ eos_ids = eos_ids | generation_eos_ids
457
491
  return eos_ids
458
492
 
459
493
  def maybe_pull_model_tokenizer_from_remote(self) -> None:
@@ -88,9 +88,11 @@ class Conversation:
88
88
  stop_str: Union[str, List[str]] = None
89
89
  # The string that represents an image token in the prompt
90
90
  image_token: str = "<image>"
91
+ video_token: str = "<video>"
91
92
  audio_token: str = "<audio>"
92
93
 
93
94
  image_data: Optional[List[str]] = None
95
+ video_data: Optional[List[str]] = None
94
96
  modalities: Optional[List[str]] = None
95
97
  stop_token_ids: Optional[int] = None
96
98
 
@@ -380,11 +382,15 @@ class Conversation:
380
382
  self.messages.append([role, message])
381
383
 
382
384
  def append_image(self, image: str):
383
- """Append a new message."""
385
+ """Append a new image."""
384
386
  self.image_data.append(image)
385
387
 
388
def append_video(self, video: str):
    """Append a new video.

    Args:
        video: Video reference to store; it is appended to
            ``self.video_data`` unchanged.
    """
    # ``video_data`` defaults to None, so create the list on first use
    # instead of failing with AttributeError.
    if self.video_data is None:
        self.video_data = []
    self.video_data.append(video)
391
+
386
392
  def append_audio(self, audio: str):
387
- """Append a new message."""
393
+ """Append a new audio."""
388
394
  self.audio_data.append(audio)
389
395
 
390
396
  def update_last_message(self, message: str):
@@ -433,6 +439,7 @@ class Conversation:
433
439
  sep2=self.sep2,
434
440
  stop_str=self.stop_str,
435
441
  image_token=self.image_token,
442
+ video_token=self.video_token,
436
443
  audio_token=self.audio_token,
437
444
  )
438
445
 
@@ -495,8 +502,12 @@ def generate_embedding_convs(
495
502
  sep2=conv_template.sep2,
496
503
  stop_str=conv_template.stop_str,
497
504
  image_data=[],
505
+ video_data=[],
506
+ audio_data=[],
498
507
  modalities=[],
499
508
  image_token=conv_template.image_token,
509
+ video_token=conv_template.video_token,
510
+ audio_token=conv_template.audio_token,
500
511
  )
501
512
  real_content = ""
502
513
 
@@ -557,10 +568,12 @@ def generate_chat_conv(
557
568
  sep2=conv.sep2,
558
569
  stop_str=conv.stop_str,
559
570
  image_data=[],
571
+ video_data=[],
560
572
  audio_data=[],
561
573
  modalities=[],
562
574
  image_token=conv.image_token,
563
575
  audio_token=conv.audio_token,
576
+ video_token=conv.video_token,
564
577
  )
565
578
 
566
579
  if isinstance(request.messages, str):
@@ -602,6 +615,7 @@ def generate_chat_conv(
602
615
  image_token = ""
603
616
 
604
617
  audio_token = conv.audio_token
618
+ video_token = conv.video_token
605
619
  for content in message.content:
606
620
  if content.type == "text":
607
621
  if num_image_url > 16:
@@ -614,6 +628,9 @@ def generate_chat_conv(
614
628
  else:
615
629
  real_content += image_token
616
630
  conv.append_image(content.image_url.url)
631
+ elif content.type == "video_url":
632
+ real_content += video_token
633
+ conv.append_video(content.video_url.url)
617
634
  elif content.type == "audio_url":
618
635
  real_content += audio_token
619
636
  conv.append_audio(content.audio_url.url)
@@ -810,6 +827,7 @@ register_conv_template(
810
827
  sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
811
828
  stop_str=["<|im_end|>"],
812
829
  image_token="<|vision_start|><|image_pad|><|vision_end|>",
830
+ video_token="<|vision_start|><|video_pad|><|vision_end|>",
813
831
  )
814
832
  )
815
833
 
@@ -870,6 +888,7 @@ register_conv_template(
870
888
  sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
871
889
  stop_str=("<|im_end|>", "<|endoftext|>"),
872
890
  image_token="(<image>./</image>)",
891
+ video_token="(<video>./</video>)",
873
892
  )
874
893
  )
875
894
 
@@ -921,6 +940,19 @@ register_conv_template(
921
940
  )
922
941
  )
923
942
 
943
# MiMo-VL chat template: each turn is framed as "<|im_start|>role ... <|im_end|>",
# images are represented by a single vision placeholder token, and generation
# stops at "<|im_end|>".
register_conv_template(
    Conversation(
        name="mimo-vl",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        system_template="<|im_start|>system\n{system_message}",
        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
        sep="<|im_end|>\n",
        stop_str=["<|im_end|>"],
        image_token="<|vision_start|><|image_pad|><|vision_end|>",
    )
)
955
+
924
956
 
925
957
  register_conv_template(
926
958
  Conversation(
@@ -935,6 +967,19 @@ register_conv_template(
935
967
  )
936
968
  )
937
969
 
970
# Llama 4 vision chat template: the system prompt is wrapped in
# "<|header_start|>system<|header_end|>" ... "<|eot|>", turns use the LLAMA4
# separator style with an empty separator, and generation stops at "<|eot|>".
register_conv_template(
    Conversation(
        name="llama_4_vision",
        roles=("user", "assistant"),
        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
        sep="",
        sep_style=SeparatorStyle.LLAMA4,
        stop_str="<|eot|>",
        image_token="<|image|>",
    )
)
982
+
938
983
 
939
984
  @register_conv_template_matching_function
940
985
  def match_internvl(model_path: str):
@@ -943,9 +988,11 @@ def match_internvl(model_path: str):
943
988
 
944
989
 
945
990
  @register_conv_template_matching_function
946
- def match_llama_3_vision(model_path: str):
991
def match_llama_vision(model_path: str):
    """Return the Llama vision conversation template matching ``model_path``.

    Args:
        model_path: Model path or hub identifier; matched case-insensitively.

    Returns:
        ``"llama_3_vision"`` for Llama 3.2 Vision paths, ``"llama_4_vision"``
        for Llama 4 paths, otherwise ``None``.
    """
    if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
        return "llama_3_vision"
    # The "4" must directly follow "llama" (optional separator and/or "v") and
    # must not start a longer token. A bare ``llama.*4.*`` would also match
    # unrelated names such as "Llama-3.1-405B" or "...-int4".
    if re.search(r"llama[-_.\s]*v?4(?![0-9a-z])", model_path, re.IGNORECASE):
        return "llama_4_vision"
    return None
949
996
 
950
997
 
951
998
  @register_conv_template_matching_function
@@ -1034,3 +1081,9 @@ def match_phi_4_mm(model_path: str):
1034
1081
  def match_vila(model_path: str):
1035
1082
  if re.search(r"vila", model_path, re.IGNORECASE):
1036
1083
  return "chatml"
1084
+
1085
+
1086
+ @register_conv_template_matching_function
1087
def match_mimo_vl(model_path: str):
    """Return ``"mimo-vl"`` when ``model_path`` names a MiMo-VL model.

    The path matches when it contains "mimo" followed (anywhere later) by
    "vl", case-insensitively; otherwise ``None`` is returned.
    """
    is_mimo_vl = re.search(r"mimo.*vl", model_path, re.IGNORECASE) is not None
    return "mimo-vl" if is_mimo_vl else None
@@ -0,0 +1,6 @@
1
+ from sglang.srt.disaggregation.ascend.conn import (
2
+ AscendKVBootstrapServer,
3
+ AscendKVManager,
4
+ AscendKVReceiver,
5
+ AscendKVSender,
6
+ )
@@ -0,0 +1,44 @@
1
+ import logging
2
+
3
+ from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
4
+ from sglang.srt.disaggregation.mooncake.conn import (
5
+ MooncakeKVBootstrapServer,
6
+ MooncakeKVManager,
7
+ MooncakeKVReceiver,
8
+ MooncakeKVSender,
9
+ )
10
+ from sglang.srt.utils import get_local_ip_by_remote
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class AscendKVManager(MooncakeKVManager):
    """Mooncake KV manager variant that uses the Ascend transfer engine."""

    def init_engine(self):
        """Create the ``AscendTransferEngine`` and store it on ``self.engine``."""
        # TransferEngine initialized on ascend.
        self.engine = AscendTransferEngine(
            hostname=get_local_ip_by_remote(),
            npu_id=self.kv_args.gpu_id,
            disaggregation_mode=self.disaggregation_mode,
        )

    def register_buffer_to_engine(self):
        """Register the KV and auxiliary buffers with the transfer engine."""
        kv_args = self.kv_args
        # A single registration that starts at the first KV pointer and spans
        # the summed length of all KV buffers.
        # NOTE(review): presumably the KV buffers are contiguous — confirm.
        total_kv_len = sum(kv_args.kv_data_lens)
        self.engine.register(kv_args.kv_data_ptrs[0], total_kv_len)
        # The Ascend backend optimizes batch registration for small memory blocks.
        self.engine.batch_register(kv_args.aux_data_ptrs, kv_args.aux_data_lens)
33
+
34
+
35
class AscendKVSender(MooncakeKVSender):
    """KV sender for the Ascend backend; inherits ``MooncakeKVSender`` unchanged."""
37
+
38
+
39
class AscendKVReceiver(MooncakeKVReceiver):
    """KV receiver for the Ascend backend; inherits ``MooncakeKVReceiver`` unchanged."""
41
+
42
+
43
class AscendKVBootstrapServer(MooncakeKVBootstrapServer):
    """Bootstrap server for the Ascend backend; inherits ``MooncakeKVBootstrapServer`` unchanged."""
@@ -0,0 +1,58 @@
1
+ import logging
2
+ import os
3
+ from typing import List, Optional
4
+
5
+ from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
6
+ from sglang.srt.disaggregation.utils import DisaggregationMode
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class AscendTransferEngine(MooncakeTransferEngine):
    """Transfer engine backed by ``mf_adapter.TransferEngine``.

    Subclasses ``MooncakeTransferEngine`` but does not call its ``__init__``;
    the underlying engine, role and session id are set up here instead.
    """

    def __init__(
        self, hostname: str, npu_id: int, disaggregation_mode: DisaggregationMode
    ):
        """Create and initialize the underlying transfer engine.

        Args:
            hostname: Local host name or IP; used as the session id prefix.
            npu_id: Device id passed to the engine on initialization.
            disaggregation_mode: PREFILL or DECODE; selects the engine role.

        Raises:
            ImportError: If ``mf_adapter`` is not installed.
            ValueError: If ``disaggregation_mode`` is neither PREFILL nor DECODE.
            RuntimeError: If the engine reports an initialization failure.
        """
        try:
            from mf_adapter import TransferEngine
        except ImportError as e:
            raise ImportError(
                "Please install mf_adapter, for details, see docs/backend/pd_disaggregation.md"
            ) from e

        self.engine = TransferEngine()
        self.hostname = hostname
        self.npu_id = npu_id

        # Centralized storage address of the AscendTransferEngine
        self.store_url = os.getenv("ASCEND_MF_STORE_URL")
        if not self.store_url:
            # Do not fail here (the engine decides what is acceptable), but
            # leave a hint so a later initialization error is easy to diagnose.
            logger.warning(
                "ASCEND_MF_STORE_URL is not set; "
                "Ascend Transfer Engine initialization may fail."
            )
        if disaggregation_mode == DisaggregationMode.PREFILL:
            self.role = "Prefill"
        elif disaggregation_mode == DisaggregationMode.DECODE:
            self.role = "Decode"
        else:
            logger.error(f"Unsupported DisaggregationMode: {disaggregation_mode}")
            raise ValueError(f"Unsupported DisaggregationMode: {disaggregation_mode}")
        self.session_id = f"{self.hostname}:{self.engine.get_rpc_port()}"
        self.initialize()

    def initialize(self) -> None:
        """Initialize the ascend transfer instance.

        Raises:
            RuntimeError: If the engine returns a non-zero status.
        """
        ret_value = self.engine.initialize(
            self.store_url,
            self.session_id,
            self.role,
            self.npu_id,
        )
        if ret_value != 0:
            logger.error("Ascend Transfer Engine initialization failed.")
            raise RuntimeError("Ascend Transfer Engine initialization failed.")

    def batch_register(self, ptrs: List[int], lengths: List[int]):
        """Register several memory regions with the engine in one call.

        Best effort: a failure is logged and never raised to the caller.

        Args:
            ptrs: Start addresses of the regions.
            lengths: Sizes of the regions, passed to the engine alongside ``ptrs``.
        """
        try:
            ret_value = self.engine.batch_register_memory(ptrs, lengths)
        except Exception:
            # Mark register as failed, but keep the exception details in the
            # log instead of discarding them.
            logger.warning(
                "Ascend batch memory registration raised an exception.",
                exc_info=True,
            )
            ret_value = -1
        if ret_value != 0:
            logger.debug(f"Ascend memory registration for ptr {ptrs} failed.")
@@ -132,13 +132,9 @@ class MooncakeKVManager(BaseKVManager):
132
132
  ):
133
133
  self.kv_args = args
134
134
  self.local_ip = get_local_ip_auto()
135
- self.engine = MooncakeTransferEngine(
136
- hostname=self.local_ip,
137
- gpu_id=self.kv_args.gpu_id,
138
- ib_device=self.kv_args.ib_device,
139
- )
140
135
  self.is_mla_backend = is_mla_backend
141
136
  self.disaggregation_mode = disaggregation_mode
137
+ self.init_engine()
142
138
  # for p/d multi node infer
143
139
  self.bootstrap_port = server_args.disaggregation_bootstrap_port
144
140
  self.dist_init_addr = server_args.dist_init_addr
@@ -185,9 +181,11 @@ class MooncakeKVManager(BaseKVManager):
185
181
  threading.Thread(
186
182
  target=self.transfer_worker, args=(queue, executor), daemon=True
187
183
  ).start()
188
-
189
- self.bootstrap_time_out = get_int_env_var(
190
- "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 120
184
+ # If a timeout happens on the prefill side, it means prefill instances
185
+ # fail to receive the KV indices from the decode instance of this request.
186
+ # These timeout requests should be aborted to release the tree cache.
187
+ self.bootstrap_timeout = get_int_env_var(
188
+ "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 300
191
189
  )
192
190
  elif self.disaggregation_mode == DisaggregationMode.DECODE:
193
191
  self.heartbeat_failures = {}
@@ -209,6 +207,12 @@ class MooncakeKVManager(BaseKVManager):
209
207
  self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
210
208
  self.prefill_tp_size_table: Dict[str, int] = {}
211
209
  self.prefill_dp_size_table: Dict[str, int] = {}
210
+ # If a timeout happens on the decode side, it means decode instances
211
+ # fail to receive the KV Cache transfer done signal after bootstrapping.
212
+ # These timeout requests should be aborted to release the tree cache.
213
+ self.waiting_timeout = get_int_env_var(
214
+ "SGLANG_DISAGGREGATION_WAITING_TIMEOUT", 300
215
+ )
212
216
  else:
213
217
  raise ValueError(
214
218
  f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
@@ -217,6 +221,13 @@ class MooncakeKVManager(BaseKVManager):
217
221
  self.failure_records: Dict[int, str] = {}
218
222
  self.failure_lock = threading.Lock()
219
223
 
224
def init_engine(self):
    """Create the Mooncake transfer engine and store it on ``self.engine``.

    The engine is built from the local IP plus the GPU id and IB device
    taken from ``self.kv_args``.
    """
    kv_args = self.kv_args
    self.engine = MooncakeTransferEngine(
        hostname=self.local_ip,
        gpu_id=kv_args.gpu_id,
        ib_device=kv_args.ib_device,
    )
230
+
220
231
  def register_buffer_to_engine(self):
221
232
  for kv_data_ptr, kv_data_len in zip(
222
233
  self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -259,19 +270,17 @@ class MooncakeKVManager(BaseKVManager):
259
270
 
260
271
  # Worker function for processing a single layer
261
272
  def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
262
- src_addr_list = []
263
- dst_addr_list = []
264
- length_list = []
265
273
  for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
266
274
  src_addr = src_ptr + int(prefill_index[0]) * item_len
267
275
  dst_addr = dst_ptr + int(decode_index[0]) * item_len
268
276
  length = item_len * len(prefill_index)
269
- src_addr_list.append(src_addr)
270
- dst_addr_list.append(dst_addr)
271
- length_list.append(length)
272
- return self.engine.batch_transfer_sync(
273
- mooncake_session_id, src_addr_list, dst_addr_list, length_list
274
- )
277
+
278
+ status = self.engine.transfer_sync(
279
+ mooncake_session_id, src_addr, dst_addr, length
280
+ )
281
+ if status != 0:
282
+ return status
283
+ return 0
275
284
 
276
285
  futures = [
277
286
  executor.submit(
@@ -938,7 +947,12 @@ class MooncakeKVSender(BaseKVSender):
938
947
  if self.init_time is not None:
939
948
  now = time.time()
940
949
  elapsed = now - self.init_time
941
- if elapsed >= self.kv_mgr.bootstrap_time_out:
950
+ if elapsed >= self.kv_mgr.bootstrap_timeout:
951
+ logger.warning_once(
952
+ "Some requests timed out when bootstrapping, "
953
+ "which means prefill instances fail to receive the KV indices from the decode instance of this request. "
954
+ "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
955
+ )
942
956
  self.kv_mgr.record_failure(
943
957
  self.bootstrap_room,
944
958
  f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
@@ -987,6 +1001,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
987
1001
  self.session_id = self.kv_mgr.get_session_id()
988
1002
  self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
989
1003
  self.conclude_state = None
1004
+ self.init_time = None
990
1005
  self.data_parallel_rank = data_parallel_rank
991
1006
 
992
1007
  if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
@@ -1222,14 +1237,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
1222
1237
  str(self.required_dst_info_num).encode("ascii"),
1223
1238
  ]
1224
1239
  )
1240
+ self.init_time = time.time()
1225
1241
 
1226
1242
  def poll(self) -> KVPoll:
1227
1243
  if self.conclude_state is None:
1228
1244
  status = self.kv_mgr.check_status(self.bootstrap_room)
1229
1245
  if status in (KVPoll.Success, KVPoll.Failed):
1230
1246
  self.conclude_state = status
1247
+ elif status == KVPoll.WaitingForInput:
1248
+ if self.init_time is not None:
1249
+ now = time.time()
1250
+ elapsed = now - self.init_time
1251
+ if elapsed >= self.kv_mgr.waiting_timeout:
1252
+ logger.warning_once(
1253
+ "Some requests fail to receive KV Cache transfer done signal after bootstrapping. "
1254
+ "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
1255
+ )
1256
+ self.kv_mgr.record_failure(
1257
+ self.bootstrap_room,
1258
+ f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.WaitingForInput",
1259
+ )
1260
+ self.conclude_state = KVPoll.Failed
1261
+ return KVPoll.Failed
1231
1262
 
1232
1263
  return status
1264
+
1233
1265
  else:
1234
1266
  return self.conclude_state
1235
1267
 
@@ -1,8 +1,8 @@
1
- import json
2
1
  import logging
3
- from dataclasses import dataclass
4
2
  from typing import List, Optional
5
3
 
4
+ from sglang.srt.utils import get_bool_env_var, get_free_port
5
+
6
6
  logger = logging.getLogger(__name__)
7
7
 
8
8
 
@@ -55,12 +55,21 @@ class MooncakeTransferEngine:
55
55
  device_name: Optional[str],
56
56
  ) -> None:
57
57
  """Initialize the mooncake instance."""
58
- ret_value = self.engine.initialize(
59
- hostname,
60
- "P2PHANDSHAKE",
61
- "rdma",
62
- device_name if device_name is not None else "",
63
- )
58
+ if get_bool_env_var("ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE", "false"):
59
+ hostname += f":{get_free_port()}:npu_{self.gpu_id}"
60
+ ret_value = self.engine.initialize(
61
+ hostname,
62
+ "P2PHANDSHAKE",
63
+ "ascend",
64
+ device_name if device_name is not None else "",
65
+ )
66
+ else:
67
+ ret_value = self.engine.initialize(
68
+ hostname,
69
+ "P2PHANDSHAKE",
70
+ "rdma",
71
+ device_name if device_name is not None else "",
72
+ )
64
73
  if ret_value != 0:
65
74
  logger.error("Mooncake Transfer Engine initialization failed.")
66
75
  raise RuntimeError("Mooncake Transfer Engine initialization failed.")