sglang 0.4.9.post1__tar.gz → 0.4.9.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (750)
  1. {sglang-0.4.9.post1/sglang.egg-info → sglang-0.4.9.post2}/PKG-INFO +4 -3
  2. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/pyproject.toml +4 -3
  3. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/model_config.py +24 -1
  4. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/conversation.py +21 -2
  5. sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/__init__.py +6 -0
  6. sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/conn.py +44 -0
  7. sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  8. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/conn.py +15 -14
  9. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  10. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/utils.py +25 -3
  11. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/engine.py +1 -1
  12. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/http_server.py +1 -0
  13. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/protocol.py +11 -0
  14. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  15. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/function_call_parser.py +2 -0
  16. sglang-0.4.9.post2/sglang/srt/function_call/kimik2_detector.py +220 -0
  17. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/hf_transformers_utils.py +18 -0
  18. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/jinja_template_utils.py +8 -0
  19. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/communicator.py +17 -4
  20. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/linear.py +12 -2
  21. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  22. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/layer.py +2 -1
  23. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -2
  24. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/topk.py +8 -2
  25. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/parameter.py +19 -3
  26. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  27. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/moe_wna16.py +1 -2
  28. sglang-0.4.9.post2/sglang/srt/layers/quantization/w8a8_int8.py +1047 -0
  29. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/io_struct.py +27 -2
  30. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/mm_utils.py +55 -94
  31. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/schedule_batch.py +16 -5
  32. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/scheduler.py +21 -1
  33. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/tokenizer_manager.py +16 -0
  34. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/memory_pool.py +65 -40
  35. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_executor/forward_batch_info.py +13 -1
  36. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_loader/loader.py +23 -12
  37. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_janus_pro.py +1 -1
  38. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_v2.py +62 -17
  39. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_vl2.py +1 -1
  40. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma3_mm.py +1 -1
  41. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma3n_mm.py +6 -3
  42. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/internvl.py +8 -2
  43. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/kimi_vl.py +8 -2
  44. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llama.py +2 -0
  45. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llava.py +3 -1
  46. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llavavid.py +1 -1
  47. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/minicpmo.py +1 -2
  48. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/minicpmv.py +1 -1
  49. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/mixtral_quant.py +4 -0
  50. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/mllama4.py +13 -4
  51. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/phi4mm.py +8 -2
  52. sglang-0.4.9.post2/sglang/srt/models/phimoe.py +553 -0
  53. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2.py +2 -0
  54. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_5_vl.py +10 -7
  55. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_vl.py +12 -1
  56. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/vila.py +8 -2
  57. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/base_processor.py +197 -137
  58. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  59. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/gemma3.py +4 -2
  60. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/gemma3n.py +1 -1
  61. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/internvl.py +1 -1
  62. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/janus_pro.py +1 -1
  63. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  64. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/minicpm.py +4 -3
  65. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/mllama4.py +1 -1
  66. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/phi4mm.py +1 -1
  67. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/pixtral.py +1 -1
  68. sglang-0.4.9.post2/sglang/srt/multimodal/processors/qwen_vl.py +286 -0
  69. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/vila.py +1 -1
  70. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/server_args.py +11 -4
  71. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/utils.py +154 -31
  72. sglang-0.4.9.post2/sglang/version.py +1 -0
  73. {sglang-0.4.9.post1 → sglang-0.4.9.post2/sglang.egg-info}/PKG-INFO +4 -3
  74. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang.egg-info/SOURCES.txt +5 -0
  75. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang.egg-info/requires.txt +3 -2
  76. sglang-0.4.9.post1/sglang/srt/layers/quantization/w8a8_int8.py +0 -323
  77. sglang-0.4.9.post1/sglang/srt/multimodal/processors/qwen_vl.py +0 -163
  78. sglang-0.4.9.post1/sglang/version.py +0 -1
  79. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/LICENSE +0 -0
  80. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/README.md +0 -0
  81. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/setup.cfg +0 -0
  82. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/__init__.py +0 -0
  83. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/api.py +0 -0
  84. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/bench_offline_throughput.py +0 -0
  85. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/bench_one_batch.py +0 -0
  86. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/bench_one_batch_server.py +0 -0
  87. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/bench_serving.py +0 -0
  88. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/check_env.py +0 -0
  89. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/compile_deep_gemm.py +0 -0
  90. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/eval/llama3_eval.py +0 -0
  91. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/eval/loogle_eval.py +0 -0
  92. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/global_config.py +0 -0
  93. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/backend/__init__.py +0 -0
  94. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/backend/anthropic.py +0 -0
  95. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/backend/base_backend.py +0 -0
  96. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/backend/litellm.py +0 -0
  97. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/backend/openai.py +0 -0
  98. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  99. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/backend/vertexai.py +0 -0
  100. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/chat_template.py +0 -0
  101. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/choices.py +0 -0
  102. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/compiler.py +0 -0
  103. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/interpreter.py +0 -0
  104. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/ir.py +0 -0
  105. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/lang/tracer.py +0 -0
  106. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/launch_server.py +0 -0
  107. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/profiler.py +0 -0
  108. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/_custom_ops.py +0 -0
  109. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/aio_rwlock.py +0 -0
  110. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/code_completion_parser.py +0 -0
  111. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/__init__.py +0 -0
  112. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/chatglm.py +0 -0
  113. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/dbrx.py +0 -0
  114. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/deepseekvl2.py +0 -0
  115. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/device_config.py +0 -0
  116. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/exaone.py +0 -0
  117. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/internvl.py +0 -0
  118. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/janus_pro.py +0 -0
  119. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/kimi_vl.py +0 -0
  120. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
  121. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/load_config.py +0 -0
  122. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/update_config.py +0 -0
  123. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/configs/utils.py +0 -0
  124. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/__init__.py +0 -0
  125. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/base_connector.py +0 -0
  126. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/redis.py +0 -0
  127. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/s3.py +0 -0
  128. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/serde/__init__.py +0 -0
  129. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/serde/safe_serde.py +0 -0
  130. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/serde/serde.py +0 -0
  131. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/connector/utils.py +0 -0
  132. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constants.py +0 -0
  133. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  134. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constrained/llguidance_backend.py +0 -0
  135. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constrained/outlines_backend.py +0 -0
  136. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  137. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  138. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  139. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  140. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/custom_op.py +0 -0
  141. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/debug_utils.py +0 -0
  142. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/base/__init__.py +0 -0
  143. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/base/conn.py +0 -0
  144. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/common/__init__.py +0 -0
  145. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/common/conn.py +0 -0
  146. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/common/utils.py +0 -0
  147. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/decode.py +0 -0
  148. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
  149. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/fake/__init__.py +0 -0
  150. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/fake/conn.py +0 -0
  151. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/kv_events.py +0 -0
  152. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/launch_lb.py +0 -0
  153. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mini_lb.py +0 -0
  154. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  155. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  156. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/nixl/conn.py +0 -0
  157. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/disaggregation/prefill.py +0 -0
  158. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/__init__.py +0 -0
  159. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/communication_op.py +0 -0
  160. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  161. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  162. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  163. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  164. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
  165. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
  166. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  167. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  168. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  169. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  170. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/parallel_state.py +0 -0
  171. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/distributed/utils.py +0 -0
  172. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/EngineBase.py +0 -0
  173. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/http_server_engine.py +0 -0
  174. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/__init__.py +0 -0
  175. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
  176. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
  177. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
  178. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
  179. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
  180. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
  181. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/entrypoints/openai/utils.py +0 -0
  182. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/__init__.py +0 -0
  183. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
  184. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
  185. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
  186. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_manager.py +0 -0
  187. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
  188. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
  189. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_distribution.py +0 -0
  190. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_location.py +0 -0
  191. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_location_dispatch.py +0 -0
  192. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/eplb/expert_location_updater.py +0 -0
  193. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/base_format_detector.py +0 -0
  194. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/core_types.py +0 -0
  195. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
  196. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/ebnf_composer.py +0 -0
  197. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/llama32_detector.py +0 -0
  198. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/mistral_detector.py +0 -0
  199. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/pythonic_detector.py +0 -0
  200. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/qwen25_detector.py +0 -0
  201. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/function_call/utils.py +0 -0
  202. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/activation.py +0 -0
  203. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/amx_utils.py +0 -0
  204. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/aiter_backend.py +0 -0
  205. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/ascend_backend.py +0 -0
  206. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  207. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
  208. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  209. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashattention_backend.py +0 -0
  210. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  211. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  212. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  213. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
  214. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/merge_state.py +0 -0
  215. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/tbo_backend.py +0 -0
  216. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  217. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
  218. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  219. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  220. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  221. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
  222. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  223. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  224. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/utils.py +0 -0
  225. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/attention/vision.py +0 -0
  226. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/dp_attention.py +0 -0
  227. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/elementwise.py +0 -0
  228. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/flashinfer_comm_fusion.py +0 -0
  229. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/layernorm.py +0 -0
  230. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/logits_processor.py +0 -0
  231. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/cutlass_moe.py +0 -0
  232. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
  233. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/cutlass_w4a8_moe.py +0 -0
  234. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  235. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
  236. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  237. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  238. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  239. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  240. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  241. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  242. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  243. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  244. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  245. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  246. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  247. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  248. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  249. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  250. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  251. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  252. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  253. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  254. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  255. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  256. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  257. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  258. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  259. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  260. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  261. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  262. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  263. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  264. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  265. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  266. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  267. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  268. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  269. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  270. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  271. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  272. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  273. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  274. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  275. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  276. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  277. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  278. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  279. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  280. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  281. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  282. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  283. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  284. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  285. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  286. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  287. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  288. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  289. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  290. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  291. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  292. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  293. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  294. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  295. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  296. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  297. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  298. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  299. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  300. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  301. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  302. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  303. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  304. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  305. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  306. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  307. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  308. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  309. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  310. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  311. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  312. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  313. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  314. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  315. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  316. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  317. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  318. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  319. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  320. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  321. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  322. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  323. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  324. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  325. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  326. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  327. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  328. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  329. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  330. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  331. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  332. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  333. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  334. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  335. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  336. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  337. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  338. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  339. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  340. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  341. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  342. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  343. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  344. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  345. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  346. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  347. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  348. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  349. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  350. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  351. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  352. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  353. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  354. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  355. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  356. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  357. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  358. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  359. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  360. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  361. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  362. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  363. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  364. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  365. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  366. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  367. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
  368. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
  369. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  370. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
  372. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
  374. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  375. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  376. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  377. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
  379. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
  381. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
  382. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  388. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  392. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  395. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  396. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  397. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  400. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +0 -0
  401. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -0
  402. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/moe/router.py +0 -0
  403. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/multimodal.py +0 -0
  404. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/pooler.py +0 -0
  405. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
  406. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/awq.py +0 -0
  407. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  408. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  409. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  410. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  411. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  412. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  413. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  414. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
  415. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  416. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  417. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  501. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  502. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  503. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  504. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  505. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  506. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  507. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  508. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  509. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  510. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  511. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  512. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  513. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  514. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  515. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  516. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  517. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  518. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  519. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  520. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  521. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  522. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  523. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  524. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  525. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  526. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  527. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  528. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  529. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  530. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  531. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  532. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  533. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  534. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  535. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  536. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  537. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  538. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  539. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  540. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  541. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  542. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  543. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  544. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  545. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  546. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  547. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  548. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  549. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  550. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  551. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  552. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  553. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  554. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  555. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  556. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  557. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  558. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  559. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  560. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  561. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  562. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  563. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  564. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  565. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  566. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  567. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  568. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  569. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
  570. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +0 -0
  571. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -0
  572. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +0 -0
  573. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/fp8.py +0 -0
  574. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  575. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/gptq.py +0 -0
  576. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  577. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  578. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  579. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  580. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/qoq.py +0 -0
  581. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/quant_utils.py +0 -0
  582. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/utils.py +0 -0
  583. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/w4afp8.py +0 -0
  584. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  585. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/radix_attention.py +0 -0
  586. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
  587. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/sampler.py +0 -0
  588. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  589. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/utils.py +0 -0
  590. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  591. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/backend/base_backend.py +0 -0
  592. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  593. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/backend/triton_backend.py +0 -0
  594. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/layers.py +0 -0
  595. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/lora.py +0 -0
  596. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/lora_config.py +0 -0
  597. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/lora_manager.py +0 -0
  598. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/mem_pool.py +0 -0
  599. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  600. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  601. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  602. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  603. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  604. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/lora/utils.py +0 -0
  605. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/cache_controller.py +0 -0
  606. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/configure_logging.py +0 -0
  607. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
  608. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
  609. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/multimodal_processor.py +0 -0
  610. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -0
  611. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  612. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  613. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/session_controller.py +0 -0
  614. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/template_manager.py +0 -0
  615. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/tp_worker.py +0 -0
  616. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  617. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/managers/utils.py +0 -0
  618. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/allocator.py +0 -0
  619. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  620. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  621. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  622. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  623. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/memory_pool_host.py +0 -0
  624. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
  625. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  626. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/metrics/collector.py +0 -0
  627. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/metrics/func_timer.py +0 -0
  628. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  629. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_executor/model_runner.py +0 -0
  630. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_loader/__init__.py +0 -0
  631. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_loader/utils.py +0 -0
  632. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
  633. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/model_parallel.py +0 -0
  634. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/baichuan.py +0 -0
  635. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/bert.py +0 -0
  636. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/chatglm.py +0 -0
  637. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/clip.py +0 -0
  638. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/commandr.py +0 -0
  639. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/dbrx.py +0 -0
  640. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/deepseek.py +0 -0
  641. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/deepseek_nextn.py +0 -0
  642. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/exaone.py +0 -0
  643. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma.py +0 -0
  644. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma2.py +0 -0
  645. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma2_reward.py +0 -0
  646. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma3_causal.py +0 -0
  647. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma3n_audio.py +0 -0
  648. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gemma3n_causal.py +0 -0
  649. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/glm4.py +0 -0
  650. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gpt2.py +0 -0
  651. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  652. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/granite.py +0 -0
  653. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/grok.py +0 -0
  654. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/hunyuan.py +0 -0
  655. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/idefics2.py +0 -0
  656. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/internlm2.py +0 -0
  657. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/internlm2_reward.py +0 -0
  658. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
  659. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llama4.py +0 -0
  660. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llama_classification.py +0 -0
  661. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llama_eagle.py +0 -0
  662. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llama_eagle3.py +0 -0
  663. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llama_embedding.py +0 -0
  664. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/llama_reward.py +0 -0
  665. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/mimo.py +0 -0
  666. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/mimo_mtp.py +0 -0
  667. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/minicpm.py +0 -0
  668. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/minicpm3.py +0 -0
  669. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/mistral.py +0 -0
  670. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/mixtral.py +0 -0
  671. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/mllama.py +0 -0
  672. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/olmo.py +0 -0
  673. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/olmo2.py +0 -0
  674. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/olmoe.py +0 -0
  675. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/phi3_small.py +0 -0
  676. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/pixtral.py +0 -0
  677. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen.py +0 -0
  678. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_audio.py +0 -0
  679. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_classification.py +0 -0
  680. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
  681. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_moe.py +0 -0
  682. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen2_rm.py +0 -0
  683. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen3.py +0 -0
  684. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/qwen3_moe.py +0 -0
  685. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/registry.py +0 -0
  686. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/roberta.py +0 -0
  687. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/siglip.py +0 -0
  688. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/stablelm.py +0 -0
  689. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  690. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/transformers.py +0 -0
  691. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/xverse.py +0 -0
  692. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/xverse_moe.py +0 -0
  693. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/models/yivl.py +0 -0
  694. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/mm_utils.py +0 -0
  695. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/clip.py +0 -0
  696. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/llava.py +0 -0
  697. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/multimodal/processors/mlama.py +0 -0
  698. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/operations.py +0 -0
  699. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/operations_strategy.py +0 -0
  700. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/patch_torch.py +0 -0
  701. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/reasoning_parser.py +0 -0
  702. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  703. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  704. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  705. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  706. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  707. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  708. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  709. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  710. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  711. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  712. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +0 -0
  713. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_utils.py +0 -0
  714. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/speculative/eagle_worker.py +0 -0
  715. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/speculative/spec_info.py +0 -0
  716. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  717. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/two_batch_overlap.py +0 -0
  718. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/srt/warmup.py +0 -0
  719. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/__init__.py +0 -0
  720. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/attention/__init__.py +0 -0
  721. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/attention/test_flashattn_backend.py +0 -0
  722. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  723. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  724. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  725. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  726. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/run_eval.py +0 -0
  727. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/runners.py +0 -0
  728. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/send_one.py +0 -0
  729. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/simple_eval_common.py +0 -0
  730. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  731. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  732. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/simple_eval_math.py +0 -0
  733. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  734. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  735. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_activation.py +0 -0
  736. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_block_fp8.py +0 -0
  737. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
  738. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_block_fp8_ep.py +0 -0
  739. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_custom_ops.py +0 -0
  740. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_cutlass_moe.py +0 -0
  741. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_cutlass_w4a8_moe.py +0 -0
  742. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_deepep_utils.py +0 -0
  743. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_dynamic_grad_mode.py +0 -0
  744. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_fp4_moe.py +0 -0
  745. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_layernorm.py +0 -0
  746. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_programs.py +0 -0
  747. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/test/test_utils.py +0 -0
  748. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang/utils.py +0 -0
  749. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang.egg-info/dependency_links.txt +0 -0
  750. {sglang-0.4.9.post1 → sglang-0.4.9.post2}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.9.post1
3
+ Version: 0.4.9.post2
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -242,6 +242,7 @@ Requires-Dist: pynvml; extra == "runtime-common"
242
242
  Requires-Dist: pybase64; extra == "runtime-common"
243
243
  Requires-Dist: python-multipart; extra == "runtime-common"
244
244
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
245
+ Requires-Dist: sentencepiece; extra == "runtime-common"
245
246
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
246
247
  Requires-Dist: scipy; extra == "runtime-common"
247
248
  Requires-Dist: torchao==0.9.0; extra == "runtime-common"
@@ -249,10 +250,10 @@ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
249
250
  Requires-Dist: timm==1.0.16; extra == "runtime-common"
250
251
  Requires-Dist: uvicorn; extra == "runtime-common"
251
252
  Requires-Dist: uvloop; extra == "runtime-common"
252
- Requires-Dist: xgrammar==0.1.20; extra == "runtime-common"
253
+ Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
253
254
  Provides-Extra: srt
254
255
  Requires-Dist: sglang[runtime_common]; extra == "srt"
255
- Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
256
+ Requires-Dist: sgl-kernel==0.2.5; extra == "srt"
256
257
  Requires-Dist: torch==2.7.1; extra == "srt"
257
258
  Requires-Dist: torchaudio==2.7.1; extra == "srt"
258
259
  Requires-Dist: torchvision==0.22.1; extra == "srt"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.9.post1"
7
+ version = "0.4.9.post2"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -41,6 +41,7 @@ runtime_common = [
41
41
  "pybase64",
42
42
  "python-multipart",
43
43
  "pyzmq>=25.1.2",
44
+ "sentencepiece",
44
45
  "soundfile==0.13.1",
45
46
  "scipy",
46
47
  "torchao==0.9.0",
@@ -48,12 +49,12 @@ runtime_common = [
48
49
  "timm==1.0.16",
49
50
  "uvicorn",
50
51
  "uvloop",
51
- "xgrammar==0.1.20",
52
+ "xgrammar==0.1.21",
52
53
  ]
53
54
 
54
55
  srt = [
55
56
  "sglang[runtime_common]",
56
- "sgl-kernel==0.2.4",
57
+ "sgl-kernel==0.2.5",
57
58
  "torch==2.7.1",
58
59
  "torchaudio==2.7.1",
59
60
  "torchvision==0.22.1",
@@ -25,6 +25,7 @@ from transformers import PretrainedConfig
25
25
  from sglang.srt.hf_transformers_utils import (
26
26
  get_config,
27
27
  get_context_length,
28
+ get_generation_config,
28
29
  get_hf_text_config,
29
30
  )
30
31
  from sglang.srt.layers.quantization import QUANTIZATION_METHODS
@@ -83,6 +84,13 @@ class ModelConfig:
83
84
  **kwargs,
84
85
  )
85
86
 
87
+ self.hf_generation_config = get_generation_config(
88
+ self.model_path,
89
+ trust_remote_code=trust_remote_code,
90
+ revision=revision,
91
+ **kwargs,
92
+ )
93
+
86
94
  self.hf_text_config = get_hf_text_config(self.hf_config)
87
95
  self.attention_chunk_size = getattr(
88
96
  self.hf_text_config, "attention_chunk_size", None
@@ -413,7 +421,9 @@ class ModelConfig:
413
421
  quant_cfg = self._parse_quant_hf_config()
414
422
 
415
423
  if quant_cfg is not None:
416
- quant_method = quant_cfg.get("quant_method", "").lower()
424
+ quant_method = quant_cfg.get(
425
+ "quant_method", "" if not self.quantization else self.quantization
426
+ ).lower()
417
427
 
418
428
  # Detect which checkpoint is it
419
429
  for _, method in QUANTIZATION_METHODS.items():
@@ -465,6 +475,19 @@ class ModelConfig:
465
475
  if eos_ids:
466
476
  # it can be either int or list of int
467
477
  eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
478
+ if eos_ids is None:
479
+ eos_ids = set()
480
+ if self.hf_generation_config:
481
+ generation_eos_ids = getattr(
482
+ self.hf_generation_config, "eos_token_id", None
483
+ )
484
+ if generation_eos_ids:
485
+ generation_eos_ids = (
486
+ {generation_eos_ids}
487
+ if isinstance(generation_eos_ids, int)
488
+ else set(generation_eos_ids)
489
+ )
490
+ eos_ids = eos_ids | generation_eos_ids
468
491
  return eos_ids
469
492
 
470
493
  def maybe_pull_model_tokenizer_from_remote(self) -> None:
@@ -88,9 +88,11 @@ class Conversation:
88
88
  stop_str: Union[str, List[str]] = None
89
89
  # The string that represents an image token in the prompt
90
90
  image_token: str = "<image>"
91
+ video_token: str = "<video>"
91
92
  audio_token: str = "<audio>"
92
93
 
93
94
  image_data: Optional[List[str]] = None
95
+ video_data: Optional[List[str]] = None
94
96
  modalities: Optional[List[str]] = None
95
97
  stop_token_ids: Optional[int] = None
96
98
 
@@ -380,11 +382,15 @@ class Conversation:
380
382
  self.messages.append([role, message])
381
383
 
382
384
  def append_image(self, image: str):
383
- """Append a new message."""
385
+ """Append a new image."""
384
386
  self.image_data.append(image)
385
387
 
388
+ def append_video(self, video: str):
389
+ """Append a new video."""
390
+ self.video_data.append(video)
391
+
386
392
  def append_audio(self, audio: str):
387
- """Append a new message."""
393
+ """Append a new audio."""
388
394
  self.audio_data.append(audio)
389
395
 
390
396
  def update_last_message(self, message: str):
@@ -433,6 +439,7 @@ class Conversation:
433
439
  sep2=self.sep2,
434
440
  stop_str=self.stop_str,
435
441
  image_token=self.image_token,
442
+ video_token=self.video_token,
436
443
  audio_token=self.audio_token,
437
444
  )
438
445
 
@@ -495,8 +502,12 @@ def generate_embedding_convs(
495
502
  sep2=conv_template.sep2,
496
503
  stop_str=conv_template.stop_str,
497
504
  image_data=[],
505
+ video_data=[],
506
+ audio_data=[],
498
507
  modalities=[],
499
508
  image_token=conv_template.image_token,
509
+ video_token=conv_template.video_token,
510
+ audio_token=conv_template.audio_token,
500
511
  )
501
512
  real_content = ""
502
513
 
@@ -557,10 +568,12 @@ def generate_chat_conv(
557
568
  sep2=conv.sep2,
558
569
  stop_str=conv.stop_str,
559
570
  image_data=[],
571
+ video_data=[],
560
572
  audio_data=[],
561
573
  modalities=[],
562
574
  image_token=conv.image_token,
563
575
  audio_token=conv.audio_token,
576
+ video_token=conv.video_token,
564
577
  )
565
578
 
566
579
  if isinstance(request.messages, str):
@@ -602,6 +615,7 @@ def generate_chat_conv(
602
615
  image_token = ""
603
616
 
604
617
  audio_token = conv.audio_token
618
+ video_token = conv.video_token
605
619
  for content in message.content:
606
620
  if content.type == "text":
607
621
  if num_image_url > 16:
@@ -614,6 +628,9 @@ def generate_chat_conv(
614
628
  else:
615
629
  real_content += image_token
616
630
  conv.append_image(content.image_url.url)
631
+ elif content.type == "video_url":
632
+ real_content += video_token
633
+ conv.append_video(content.video_url.url)
617
634
  elif content.type == "audio_url":
618
635
  real_content += audio_token
619
636
  conv.append_audio(content.audio_url.url)
@@ -810,6 +827,7 @@ register_conv_template(
810
827
  sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
811
828
  stop_str=["<|im_end|>"],
812
829
  image_token="<|vision_start|><|image_pad|><|vision_end|>",
830
+ video_token="<|vision_start|><|video_pad|><|vision_end|>",
813
831
  )
814
832
  )
815
833
 
@@ -870,6 +888,7 @@ register_conv_template(
870
888
  sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
871
889
  stop_str=("<|im_end|>", "<|endoftext|>"),
872
890
  image_token="(<image>./</image>)",
891
+ video_token="(<video>./</video>)",
873
892
  )
874
893
  )
875
894
 
@@ -0,0 +1,6 @@
1
+ from sglang.srt.disaggregation.ascend.conn import (
2
+ AscendKVBootstrapServer,
3
+ AscendKVManager,
4
+ AscendKVReceiver,
5
+ AscendKVSender,
6
+ )
@@ -0,0 +1,44 @@
1
+ import logging
2
+
3
+ from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
4
+ from sglang.srt.disaggregation.mooncake.conn import (
5
+ MooncakeKVBootstrapServer,
6
+ MooncakeKVManager,
7
+ MooncakeKVReceiver,
8
+ MooncakeKVSender,
9
+ )
10
+ from sglang.srt.utils import get_local_ip_by_remote
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class AscendKVManager(MooncakeKVManager):
16
+ def init_engine(self):
17
+ # TransferEngine initialized on ascend.
18
+ local_ip = get_local_ip_by_remote()
19
+ self.engine = AscendTransferEngine(
20
+ hostname=local_ip,
21
+ npu_id=self.kv_args.gpu_id,
22
+ disaggregation_mode=self.disaggregation_mode,
23
+ )
24
+
25
+ def register_buffer_to_engine(self):
26
+ self.engine.register(
27
+ self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
28
+ )
29
+ # The Ascend backend optimize batch registration for small memory blocks.
30
+ self.engine.batch_register(
31
+ self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
32
+ )
33
+
34
+
35
+ class AscendKVSender(MooncakeKVSender):
36
+ pass
37
+
38
+
39
+ class AscendKVReceiver(MooncakeKVReceiver):
40
+ pass
41
+
42
+
43
+ class AscendKVBootstrapServer(MooncakeKVBootstrapServer):
44
+ pass
@@ -0,0 +1,58 @@
1
+ import logging
2
+ import os
3
+ from typing import List, Optional
4
+
5
+ from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
6
+ from sglang.srt.disaggregation.utils import DisaggregationMode
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class AscendTransferEngine(MooncakeTransferEngine):
12
+
13
+ def __init__(
14
+ self, hostname: str, npu_id: int, disaggregation_mode: DisaggregationMode
15
+ ):
16
+ try:
17
+ from mf_adapter import TransferEngine
18
+ except ImportError as e:
19
+ raise ImportError(
20
+ "Please install mf_adapter, for details, see docs/backend/pd_disaggregation.md"
21
+ ) from e
22
+
23
+ self.engine = TransferEngine()
24
+ self.hostname = hostname
25
+ self.npu_id = npu_id
26
+
27
+ # Centralized storage address of the AscendTransferEngine
28
+ self.store_url = os.getenv("ASCEND_MF_STORE_URL")
29
+ if disaggregation_mode == DisaggregationMode.PREFILL:
30
+ self.role = "Prefill"
31
+ elif disaggregation_mode == DisaggregationMode.DECODE:
32
+ self.role = "Decode"
33
+ else:
34
+ logger.error(f"Unsupported DisaggregationMode: {disaggregation_mode}")
35
+ raise ValueError(f"Unsupported DisaggregationMode: {disaggregation_mode}")
36
+ self.session_id = f"{self.hostname}:{self.engine.get_rpc_port()}"
37
+ self.initialize()
38
+
39
+ def initialize(self) -> None:
40
+ """Initialize the ascend transfer instance."""
41
+ ret_value = self.engine.initialize(
42
+ self.store_url,
43
+ self.session_id,
44
+ self.role,
45
+ self.npu_id,
46
+ )
47
+ if ret_value != 0:
48
+ logger.error("Ascend Transfer Engine initialization failed.")
49
+ raise RuntimeError("Ascend Transfer Engine initialization failed.")
50
+
51
+ def batch_register(self, ptrs: List[int], lengths: List[int]):
52
+ try:
53
+ ret_value = self.engine.batch_register_memory(ptrs, lengths)
54
+ except Exception:
55
+ # Mark register as failed
56
+ ret_value = -1
57
+ if ret_value != 0:
58
+ logger.debug(f"Ascend memory registration for ptr {ptrs} failed.")
@@ -132,13 +132,9 @@ class MooncakeKVManager(BaseKVManager):
132
132
  ):
133
133
  self.kv_args = args
134
134
  self.local_ip = get_local_ip_auto()
135
- self.engine = MooncakeTransferEngine(
136
- hostname=self.local_ip,
137
- gpu_id=self.kv_args.gpu_id,
138
- ib_device=self.kv_args.ib_device,
139
- )
140
135
  self.is_mla_backend = is_mla_backend
141
136
  self.disaggregation_mode = disaggregation_mode
137
+ self.init_engine()
142
138
  # for p/d multi node infer
143
139
  self.bootstrap_port = server_args.disaggregation_bootstrap_port
144
140
  self.dist_init_addr = server_args.dist_init_addr
@@ -225,6 +221,13 @@ class MooncakeKVManager(BaseKVManager):
225
221
  self.failure_records: Dict[int, str] = {}
226
222
  self.failure_lock = threading.Lock()
227
223
 
224
+ def init_engine(self):
225
+ self.engine = MooncakeTransferEngine(
226
+ hostname=self.local_ip,
227
+ gpu_id=self.kv_args.gpu_id,
228
+ ib_device=self.kv_args.ib_device,
229
+ )
230
+
228
231
  def register_buffer_to_engine(self):
229
232
  for kv_data_ptr, kv_data_len in zip(
230
233
  self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -267,19 +270,17 @@ class MooncakeKVManager(BaseKVManager):
267
270
 
268
271
  # Worker function for processing a single layer
269
272
  def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
270
- src_addr_list = []
271
- dst_addr_list = []
272
- length_list = []
273
273
  for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
274
274
  src_addr = src_ptr + int(prefill_index[0]) * item_len
275
275
  dst_addr = dst_ptr + int(decode_index[0]) * item_len
276
276
  length = item_len * len(prefill_index)
277
- src_addr_list.append(src_addr)
278
- dst_addr_list.append(dst_addr)
279
- length_list.append(length)
280
- return self.engine.batch_transfer_sync(
281
- mooncake_session_id, src_addr_list, dst_addr_list, length_list
282
- )
277
+
278
+ status = self.engine.transfer_sync(
279
+ mooncake_session_id, src_addr, dst_addr, length
280
+ )
281
+ if status != 0:
282
+ return status
283
+ return 0
283
284
 
284
285
  futures = [
285
286
  executor.submit(
@@ -1,8 +1,8 @@
1
- import json
2
1
  import logging
3
- from dataclasses import dataclass
4
2
  from typing import List, Optional
5
3
 
4
+ from sglang.srt.utils import get_bool_env_var, get_free_port
5
+
6
6
  logger = logging.getLogger(__name__)
7
7
 
8
8
 
@@ -55,12 +55,21 @@ class MooncakeTransferEngine:
55
55
  device_name: Optional[str],
56
56
  ) -> None:
57
57
  """Initialize the mooncake instance."""
58
- ret_value = self.engine.initialize(
59
- hostname,
60
- "P2PHANDSHAKE",
61
- "rdma",
62
- device_name if device_name is not None else "",
63
- )
58
+ if get_bool_env_var("ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE", "false"):
59
+ hostname += f":{get_free_port()}:npu_{self.gpu_id}"
60
+ ret_value = self.engine.initialize(
61
+ hostname,
62
+ "P2PHANDSHAKE",
63
+ "ascend",
64
+ device_name if device_name is not None else "",
65
+ )
66
+ else:
67
+ ret_value = self.engine.initialize(
68
+ hostname,
69
+ "P2PHANDSHAKE",
70
+ "rdma",
71
+ device_name if device_name is not None else "",
72
+ )
64
73
  if ret_value != 0:
65
74
  logger.error("Mooncake Transfer Engine initialization failed.")
66
75
  raise RuntimeError("Mooncake Transfer Engine initialization failed.")
@@ -15,7 +15,7 @@ import requests
15
15
  import torch
16
16
  import torch.distributed as dist
17
17
 
18
- from sglang.srt.utils import get_ip
18
+ from sglang.srt.utils import get_ip, is_npu
19
19
 
20
20
  if TYPE_CHECKING:
21
21
  from sglang.srt.managers.schedule_batch import Req
@@ -94,8 +94,12 @@ class MetadataBuffers:
94
94
  custom_mem_pool: torch.cuda.MemPool = None,
95
95
  ):
96
96
  self.custom_mem_pool = custom_mem_pool
97
- device = "cuda" if self.custom_mem_pool else "cpu"
98
-
97
+ device = "cpu"
98
+ if is_npu():
99
+ # For ascend backend, output tokens are placed in the NPU and will be transferred by D2D channel.
100
+ device = "npu"
101
+ elif self.custom_mem_pool:
102
+ device = "cuda"
99
103
  with (
100
104
  torch.cuda.use_mem_pool(self.custom_mem_pool)
101
105
  if self.custom_mem_pool
@@ -200,6 +204,7 @@ class MetadataBuffers:
200
204
  class TransferBackend(Enum):
201
205
  MOONCAKE = "mooncake"
202
206
  NIXL = "nixl"
207
+ ASCEND = "ascend"
203
208
  FAKE = "fake"
204
209
 
205
210
 
@@ -231,6 +236,23 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
231
236
  KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
232
237
  }
233
238
  return class_mapping.get(class_type)
239
+ elif transfer_backend == TransferBackend.ASCEND:
240
+ from sglang.srt.disaggregation.ascend import (
241
+ AscendKVBootstrapServer,
242
+ AscendKVManager,
243
+ AscendKVReceiver,
244
+ AscendKVSender,
245
+ )
246
+ from sglang.srt.disaggregation.base import KVArgs
247
+
248
+ class_mapping = {
249
+ KVClassType.KVARGS: KVArgs,
250
+ KVClassType.MANAGER: AscendKVManager,
251
+ KVClassType.SENDER: AscendKVSender,
252
+ KVClassType.RECEIVER: (AscendKVReceiver),
253
+ KVClassType.BOOTSTRAP_SERVER: AscendKVBootstrapServer,
254
+ }
255
+ return class_mapping.get(class_type)
234
256
  elif transfer_backend == TransferBackend.NIXL:
235
257
  from sglang.srt.disaggregation.base import KVArgs
236
258
  from sglang.srt.disaggregation.nixl import (
@@ -650,7 +650,7 @@ def _set_envs_and_config(server_args: ServerArgs):
650
650
  if _is_cuda:
651
651
  assert_pkg_version(
652
652
  "sgl-kernel",
653
- "0.2.4",
653
+ "0.2.5",
654
654
  "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
655
655
  )
656
656
 
@@ -418,6 +418,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
418
418
 
419
419
  await _global_state.tokenizer_manager.start_profile(
420
420
  output_dir=obj.output_dir,
421
+ start_step=obj.start_step,
421
422
  num_steps=obj.num_steps,
422
423
  activities=obj.activities,
423
424
  with_stack=obj.with_stack,
@@ -267,6 +267,10 @@ class ChatCompletionMessageContentImageURL(BaseModel):
267
267
  detail: Optional[Literal["auto", "low", "high"]] = "auto"
268
268
 
269
269
 
270
+ class ChatCompletionMessageContentVideoURL(BaseModel):
271
+ url: str
272
+
273
+
270
274
  class ChatCompletionMessageContentAudioURL(BaseModel):
271
275
  url: str
272
276
 
@@ -277,6 +281,11 @@ class ChatCompletionMessageContentImagePart(BaseModel):
277
281
  modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
278
282
 
279
283
 
284
+ class ChatCompletionMessageContentVideoPart(BaseModel):
285
+ type: Literal["video_url"]
286
+ video_url: ChatCompletionMessageContentVideoURL
287
+
288
+
280
289
  class ChatCompletionMessageContentAudioPart(BaseModel):
281
290
  type: Literal["audio_url"]
282
291
  audio_url: ChatCompletionMessageContentAudioURL
@@ -285,6 +294,7 @@ class ChatCompletionMessageContentAudioPart(BaseModel):
285
294
  ChatCompletionMessageContentPart = Union[
286
295
  ChatCompletionMessageContentTextPart,
287
296
  ChatCompletionMessageContentImagePart,
297
+ ChatCompletionMessageContentVideoPart,
288
298
  ChatCompletionMessageContentAudioPart,
289
299
  ]
290
300
 
@@ -629,6 +639,7 @@ class MessageProcessingResult:
629
639
  prompt_ids: Union[str, List[int]]
630
640
  image_data: Optional[Any]
631
641
  audio_data: Optional[Any]
642
+ video_data: Optional[Any]
632
643
  modalities: List[str]
633
644
  stop: List[str]
634
645
  tool_call_constraint: Optional[Any] = None
@@ -82,6 +82,7 @@ class OpenAIServingChat(OpenAIServingBase):
82
82
  adapted_request = GenerateReqInput(
83
83
  **prompt_kwargs,
84
84
  image_data=processed_messages.image_data,
85
+ video_data=processed_messages.video_data,
85
86
  audio_data=processed_messages.audio_data,
86
87
  sampling_params=sampling_params,
87
88
  return_logprob=request.logprobs,
@@ -143,6 +144,7 @@ class OpenAIServingChat(OpenAIServingBase):
143
144
  prompt_ids = []
144
145
  openai_compatible_messages = []
145
146
  image_data = []
147
+ video_data = []
146
148
  audio_data = []
147
149
  modalities = []
148
150
 
@@ -158,6 +160,7 @@ class OpenAIServingChat(OpenAIServingBase):
158
160
  msg_dict,
159
161
  template_content_format,
160
162
  image_data,
163
+ video_data,
161
164
  audio_data,
162
165
  modalities,
163
166
  )
@@ -214,11 +217,13 @@ class OpenAIServingChat(OpenAIServingBase):
214
217
  stop = request.stop
215
218
  image_data = image_data if image_data else None
216
219
  audio_data = audio_data if audio_data else None
220
+ video_data = video_data if video_data else None
217
221
  modalities = modalities if modalities else []
218
222
  return MessageProcessingResult(
219
223
  prompt=prompt,
220
224
  prompt_ids=prompt_ids,
221
225
  image_data=image_data,
226
+ video_data=video_data,
222
227
  audio_data=audio_data,
223
228
  modalities=modalities,
224
229
  stop=stop,
@@ -260,6 +265,7 @@ class OpenAIServingChat(OpenAIServingBase):
260
265
  prompt = conv.get_prompt()
261
266
 
262
267
  image_data = conv.image_data if conv.image_data else None
268
+ video_data = conv.video_data if conv.video_data else None
263
269
  audio_data = conv.audio_data if conv.audio_data else None
264
270
  modalities = conv.modalities if conv.modalities else []
265
271
  stop = copy.copy(conv.stop_str or [] if not request.ignore_eos else [])
@@ -277,6 +283,7 @@ class OpenAIServingChat(OpenAIServingBase):
277
283
  prompt=prompt,
278
284
  prompt_ids=prompt_ids,
279
285
  image_data=image_data,
286
+ video_data=video_data,
280
287
  audio_data=audio_data,
281
288
  modalities=modalities,
282
289
  stop=stop,
@@ -10,6 +10,7 @@ from sglang.srt.entrypoints.openai.protocol import (
10
10
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
11
11
  from sglang.srt.function_call.core_types import ToolCallItem
12
12
  from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
13
+ from sglang.srt.function_call.kimik2_detector import KimiK2Detector
13
14
  from sglang.srt.function_call.llama32_detector import Llama32Detector
14
15
  from sglang.srt.function_call.mistral_detector import MistralDetector
15
16
  from sglang.srt.function_call.pythonic_detector import PythonicDetector
@@ -33,6 +34,7 @@ class FunctionCallParser:
33
34
  "mistral": MistralDetector,
34
35
  "deepseekv3": DeepSeekV3Detector,
35
36
  "pythonic": PythonicDetector,
37
+ "kimi_k2": KimiK2Detector,
36
38
  }
37
39
 
38
40
  def __init__(self, tools: List[Tool], tool_call_parser: str):