sglang 0.4.8.post1.tar.gz → 0.4.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (740)
  1. {sglang-0.4.8.post1/sglang.egg-info → sglang-0.4.9}/PKG-INFO +9 -6
  2. {sglang-0.4.8.post1 → sglang-0.4.9}/README.md +2 -0
  3. {sglang-0.4.8.post1 → sglang-0.4.9}/pyproject.toml +8 -9
  4. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/bench_one_batch_server.py +17 -2
  5. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/bench_serving.py +168 -22
  6. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/internvl.py +4 -2
  7. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/janus_pro.py +1 -1
  8. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/model_config.py +48 -0
  9. sglang-0.4.9/sglang/srt/configs/update_config.py +119 -0
  10. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/conversation.py +34 -0
  11. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/decode.py +21 -5
  12. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/nixl/conn.py +6 -6
  13. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/prefill.py +2 -2
  14. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/utils.py +1 -1
  15. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/parallel_state.py +44 -17
  16. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/EngineBase.py +8 -0
  17. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/engine.py +40 -6
  18. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/http_server.py +111 -24
  19. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/protocol.py +4 -2
  20. {sglang-0.4.8.post1/sglang/srt/managers → sglang-0.4.9/sglang/srt/eplb}/eplb_algorithms/__init__.py +1 -1
  21. {sglang-0.4.8.post1/sglang/srt/managers → sglang-0.4.9/sglang/srt/eplb}/eplb_manager.py +2 -4
  22. {sglang-0.4.8.post1/sglang/srt → sglang-0.4.9/sglang/srt/eplb}/eplb_simulator/reader.py +1 -1
  23. {sglang-0.4.8.post1/sglang/srt/managers → sglang-0.4.9/sglang/srt/eplb}/expert_distribution.py +1 -5
  24. {sglang-0.4.8.post1/sglang/srt/managers → sglang-0.4.9/sglang/srt/eplb}/expert_location.py +1 -1
  25. {sglang-0.4.8.post1/sglang/srt/managers → sglang-0.4.9/sglang/srt/eplb}/expert_location_dispatch.py +1 -1
  26. {sglang-0.4.8.post1/sglang/srt/model_executor → sglang-0.4.9/sglang/srt/eplb}/expert_location_updater.py +17 -1
  27. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/hf_transformers_utils.py +2 -1
  28. sglang-0.4.9/sglang/srt/layers/amx_utils.py +86 -0
  29. sglang-0.4.9/sglang/srt/layers/attention/ascend_backend.py +219 -0
  30. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/flashattention_backend.py +32 -9
  31. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/tbo_backend.py +37 -9
  32. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/communicator.py +18 -2
  33. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/dp_attention.py +9 -3
  34. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/elementwise.py +76 -12
  35. sglang-0.4.9/sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  36. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/layernorm.py +26 -0
  37. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/linear.py +84 -14
  38. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/logits_processor.py +4 -4
  39. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
  40. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/ep_moe/layer.py +36 -13
  41. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
  42. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -2
  43. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -16
  44. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/router.py +60 -22
  45. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/topk.py +10 -28
  46. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/parameter.py +67 -7
  47. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  48. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/fp8.py +44 -0
  49. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  50. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/fp8_utils.py +1 -2
  51. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/gptq.py +5 -1
  52. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/moe_wna16.py +1 -1
  53. sglang-0.4.9/sglang/srt/layers/quantization/quant_utils.py +166 -0
  54. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  55. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/rotary_embedding.py +2 -2
  56. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/vocab_parallel_embedding.py +11 -7
  57. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/lora.py +4 -5
  58. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/lora_manager.py +73 -20
  59. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/configure_logging.py +1 -1
  60. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/io_struct.py +50 -13
  61. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/mm_utils.py +73 -59
  62. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/multimodal_processor.py +2 -6
  63. sglang-0.4.9/sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  64. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/schedule_batch.py +77 -84
  65. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/scheduler.py +113 -59
  66. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  67. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/session_controller.py +12 -3
  68. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/tokenizer_manager.py +314 -103
  69. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/tp_worker.py +13 -1
  70. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  71. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/allocator.py +290 -0
  72. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/chunk_cache.py +34 -2
  73. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/memory_pool.py +289 -3
  74. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/multimodal_cache.py +3 -0
  75. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_executor/cuda_graph_runner.py +2 -1
  76. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_executor/forward_batch_info.py +17 -4
  77. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_executor/model_runner.py +297 -56
  78. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_loader/loader.py +41 -0
  79. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_loader/weight_utils.py +72 -4
  80. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/deepseek_nextn.py +1 -3
  81. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/deepseek_v2.py +181 -45
  82. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/deepseek_vl2.py +3 -5
  83. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma3_causal.py +1 -2
  84. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma3n_causal.py +4 -3
  85. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma3n_mm.py +4 -20
  86. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/hunyuan.py +1 -1
  87. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/kimi_vl.py +1 -2
  88. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llama.py +10 -4
  89. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llama4.py +32 -45
  90. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llama_eagle3.py +61 -11
  91. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llava.py +5 -5
  92. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/minicpmo.py +2 -2
  93. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/mistral.py +1 -1
  94. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/mllama4.py +43 -11
  95. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/phi4mm.py +1 -3
  96. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/pixtral.py +3 -7
  97. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen2.py +31 -3
  98. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen2_5_vl.py +1 -3
  99. sglang-0.4.9/sglang/srt/models/qwen2_audio.py +200 -0
  100. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen2_moe.py +32 -6
  101. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen2_vl.py +1 -4
  102. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen3.py +94 -25
  103. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen3_moe.py +68 -21
  104. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/vila.py +3 -8
  105. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/base_processor.py +140 -158
  106. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/clip.py +2 -13
  107. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/deepseek_vl_v2.py +4 -11
  108. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/gemma3.py +3 -10
  109. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/gemma3n.py +5 -20
  110. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/internvl.py +3 -10
  111. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/janus_pro.py +3 -9
  112. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/kimi_vl.py +6 -13
  113. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/llava.py +2 -10
  114. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/minicpm.py +5 -12
  115. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/mlama.py +2 -14
  116. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/mllama4.py +3 -6
  117. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/phi4mm.py +4 -14
  118. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/pixtral.py +3 -9
  119. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/qwen_vl.py +8 -14
  120. {sglang-0.4.8.post1/sglang/srt/managers/multimodal_processors → sglang-0.4.9/sglang/srt/multimodal/processors}/vila.py +13 -31
  121. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/operations_strategy.py +6 -2
  122. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/reasoning_parser.py +26 -0
  123. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/sampling_batch_info.py +39 -1
  124. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/server_args.py +69 -22
  125. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/speculative/build_eagle_tree.py +57 -18
  126. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/speculative/eagle_worker.py +6 -4
  127. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/two_batch_overlap.py +200 -27
  128. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/utils.py +306 -146
  129. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/warmup.py +12 -3
  130. sglang-0.4.9/sglang/test/attention/__init__.py +0 -0
  131. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/runners.py +10 -1
  132. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_utils.py +15 -3
  133. sglang-0.4.9/sglang/version.py +1 -0
  134. {sglang-0.4.8.post1 → sglang-0.4.9/sglang.egg-info}/PKG-INFO +9 -6
  135. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang.egg-info/SOURCES.txt +35 -28
  136. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang.egg-info/requires.txt +6 -5
  137. sglang-0.4.8.post1/sglang/math_utils.py +0 -8
  138. sglang-0.4.8.post1/sglang/version.py +0 -1
  139. {sglang-0.4.8.post1 → sglang-0.4.9}/LICENSE +0 -0
  140. {sglang-0.4.8.post1 → sglang-0.4.9}/setup.cfg +0 -0
  141. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/__init__.py +0 -0
  142. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/api.py +0 -0
  143. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/bench_offline_throughput.py +0 -0
  144. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/bench_one_batch.py +0 -0
  145. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/check_env.py +0 -0
  146. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/compile_deep_gemm.py +0 -0
  147. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/eval/llama3_eval.py +0 -0
  148. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/eval/loogle_eval.py +0 -0
  149. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/global_config.py +0 -0
  150. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/backend/__init__.py +0 -0
  151. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/backend/anthropic.py +0 -0
  152. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/backend/base_backend.py +0 -0
  153. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/backend/litellm.py +0 -0
  154. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/backend/openai.py +0 -0
  155. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/backend/runtime_endpoint.py +0 -0
  156. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/backend/vertexai.py +0 -0
  157. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/chat_template.py +0 -0
  158. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/choices.py +0 -0
  159. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/compiler.py +0 -0
  160. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/interpreter.py +0 -0
  161. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/ir.py +0 -0
  162. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/lang/tracer.py +0 -0
  163. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/launch_server.py +0 -0
  164. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/profiler.py +0 -0
  165. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/_custom_ops.py +0 -0
  166. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/aio_rwlock.py +0 -0
  167. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/code_completion_parser.py +0 -0
  168. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/__init__.py +0 -0
  169. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/chatglm.py +0 -0
  170. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/dbrx.py +0 -0
  171. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/deepseekvl2.py +0 -0
  172. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/device_config.py +0 -0
  173. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/exaone.py +0 -0
  174. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/kimi_vl.py +0 -0
  175. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
  176. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/load_config.py +0 -0
  177. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/utils.py +0 -0
  178. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/__init__.py +0 -0
  179. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/base_connector.py +0 -0
  180. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/redis.py +0 -0
  181. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/s3.py +0 -0
  182. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/serde/__init__.py +0 -0
  183. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/serde/safe_serde.py +0 -0
  184. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/serde/serde.py +0 -0
  185. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/connector/utils.py +0 -0
  186. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constants.py +0 -0
  187. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  188. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constrained/llguidance_backend.py +0 -0
  189. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constrained/outlines_backend.py +0 -0
  190. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  191. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  192. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  193. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  194. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/custom_op.py +0 -0
  195. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/debug_utils.py +0 -0
  196. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/base/__init__.py +0 -0
  197. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/base/conn.py +0 -0
  198. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/common/__init__.py +0 -0
  199. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/common/conn.py +0 -0
  200. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/common/utils.py +0 -0
  201. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
  202. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/fake/__init__.py +0 -0
  203. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/fake/conn.py +0 -0
  204. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/kv_events.py +0 -0
  205. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/launch_lb.py +0 -0
  206. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/mini_lb.py +0 -0
  207. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  208. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/mooncake/conn.py +0 -0
  209. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
  210. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  211. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/__init__.py +0 -0
  212. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/communication_op.py +0 -0
  213. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  214. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  215. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  216. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  217. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
  218. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
  219. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  220. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  221. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  222. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  223. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/distributed/utils.py +0 -0
  224. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/http_server_engine.py +0 -0
  225. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/__init__.py +0 -0
  226. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
  227. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/serving_chat.py +0 -0
  228. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
  229. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
  230. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
  231. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
  232. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
  233. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/entrypoints/openai/utils.py +0 -0
  234. {sglang-0.4.8.post1/sglang/srt/layers/moe/ep_moe → sglang-0.4.9/sglang/srt/eplb}/__init__.py +0 -0
  235. {sglang-0.4.8.post1/sglang/srt/managers → sglang-0.4.9/sglang/srt/eplb}/eplb_algorithms/deepseek.py +0 -0
  236. {sglang-0.4.8.post1/sglang/srt/managers → sglang-0.4.9/sglang/srt/eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  237. {sglang-0.4.8.post1/sglang/srt → sglang-0.4.9/sglang/srt/eplb}/eplb_simulator/__init__.py +0 -0
  238. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/base_format_detector.py +0 -0
  239. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/core_types.py +0 -0
  240. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
  241. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/ebnf_composer.py +0 -0
  242. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/function_call_parser.py +0 -0
  243. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/llama32_detector.py +0 -0
  244. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/mistral_detector.py +0 -0
  245. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/pythonic_detector.py +0 -0
  246. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/qwen25_detector.py +0 -0
  247. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/function_call/utils.py +0 -0
  248. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/jinja_template_utils.py +0 -0
  249. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/activation.py +2 -2
  250. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/aiter_backend.py +0 -0
  251. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  252. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
  253. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  254. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  255. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  256. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  257. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
  258. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/merge_state.py +0 -0
  259. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  260. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/triton_backend.py +0 -0
  261. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  262. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  263. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  264. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
  265. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  266. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  267. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/utils.py +0 -0
  268. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/attention/vision.py +0 -0
  269. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/cutlass_moe.py +0 -0
  270. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
  271. {sglang-0.4.8.post1/sglang/srt/layers/quantization/compressed_tensors → sglang-0.4.9/sglang/srt/layers/moe/ep_moe}/__init__.py +0 -0
  272. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  273. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  274. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  275. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  276. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  277. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  278. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  279. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  280. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  281. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  282. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  283. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  284. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  285. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  286. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  287. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  288. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  289. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  290. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  291. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  292. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  293. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  294. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  295. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  296. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  297. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  298. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  299. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  300. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  301. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  302. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  303. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  304. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  305. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  306. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  307. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  308. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  309. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  310. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  311. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  312. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  313. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  314. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  315. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  316. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  317. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  318. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  319. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  320. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  321. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  322. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  323. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  324. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  325. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  326. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  327. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  328. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  329. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  330. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  331. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  332. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  333. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  334. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  335. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  336. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  337. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  338. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  339. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  340. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  341. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  342. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  343. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  344. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  345. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  346. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  347. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  348. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  349. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  350. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  351. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  352. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  353. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  354. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  355. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  356. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  357. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  358. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  359. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  360. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  361. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  362. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  363. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  364. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  365. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  366. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  367. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  368. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  369. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  370. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  371. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  372. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  373. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  374. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  375. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  376. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  377. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  378. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  379. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  380. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  381. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  382. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  383. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  384. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  385. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  386. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  387. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  388. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  389. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  390. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  391. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  392. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  393. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  394. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  395. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  396. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  397. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  398. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  399. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  400. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  401. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  402. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  403. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
  404. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
  405. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  406. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
  408. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
  410. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  411. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  412. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  413. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
  415. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
  417. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
  418. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  424. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  428. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  431. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  432. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  433. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  436. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/multimodal.py +0 -0
  437. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/pooler.py +0 -0
  438. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/__init__.py +0 -0
  439. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/awq.py +0 -0
  440. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/base_config.py +0 -0
  441. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  442. {sglang-0.4.8.post1/sglang/test → sglang-0.4.9/sglang/srt/layers/quantization/compressed_tensors}/__init__.py +0 -0
  443. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  444. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  445. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  446. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  447. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  448. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  449. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  501. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  502. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  503. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  504. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  505. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  506. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  507. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  508. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  509. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  510. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  511. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  512. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  513. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  514. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  515. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  516. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  517. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  518. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  519. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  520. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  521. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  522. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  523. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  524. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  525. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  526. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  527. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  528. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  529. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  530. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  531. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  532. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  533. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  534. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  535. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  536. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  537. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  538. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  539. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  540. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  541. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  542. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  543. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  544. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  545. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  546. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  547. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  548. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  549. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  550. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  551. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  552. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  553. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  554. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  555. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  556. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  557. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  558. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  559. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  560. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  561. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  562. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  563. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  564. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  565. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  566. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  567. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  568. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  569. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  570. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  571. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  572. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  573. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  574. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  575. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  576. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  577. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  578. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  579. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  580. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  581. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  582. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  583. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  584. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  585. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  586. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  587. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  588. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  589. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  590. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  591. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  592. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  593. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  594. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  595. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  596. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  597. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  598. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  599. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  600. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  601. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
  602. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +0 -0
  603. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -0
  604. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +0 -0
  605. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  606. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  607. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  608. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  609. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/qoq.py +0 -0
  610. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/utils.py +0 -0
  611. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  612. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/radix_attention.py +0 -0
  613. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/sampler.py +0 -0
  614. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/torchao_utils.py +0 -0
  615. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/layers/utils.py +0 -0
  616. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/backend/base_backend.py +0 -0
  617. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  618. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/backend/triton_backend.py +0 -0
  619. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/layers.py +0 -0
  620. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/lora_config.py +0 -0
  621. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/mem_pool.py +0 -0
  622. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  623. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  624. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  625. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  626. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  627. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/lora/utils.py +0 -0
  628. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/cache_controller.py +0 -0
  629. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/data_parallel_controller.py +0 -0
  630. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/detokenizer_manager.py +0 -0
  631. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/schedule_policy.py +0 -0
  632. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/template_manager.py +0 -0
  633. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/managers/utils.py +0 -0
  634. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  635. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/flush_cache.py +0 -0
  636. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  637. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/memory_pool_host.py +0 -0
  638. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/mem_cache/radix_cache.py +0 -0
  639. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/metrics/collector.py +0 -0
  640. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/metrics/func_timer.py +0 -0
  641. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_loader/__init__.py +0 -0
  642. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_loader/utils.py +0 -0
  643. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/model_parallel.py +0 -0
  644. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/baichuan.py +0 -0
  645. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/bert.py +0 -0
  646. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/chatglm.py +0 -0
  647. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/clip.py +0 -0
  648. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/commandr.py +0 -0
  649. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/dbrx.py +0 -0
  650. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/deepseek.py +0 -0
  651. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  652. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/exaone.py +0 -0
  653. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma.py +0 -0
  654. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma2.py +0 -0
  655. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma2_reward.py +0 -0
  656. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma3_mm.py +0 -0
  657. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gemma3n_audio.py +0 -0
  658. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/glm4.py +0 -0
  659. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gpt2.py +0 -0
  660. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/gpt_bigcode.py +0 -0
  661. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/granite.py +0 -0
  662. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/grok.py +0 -0
  663. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/idefics2.py +0 -0
  664. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/internlm2.py +0 -0
  665. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/internlm2_reward.py +0 -0
  666. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/internvl.py +0 -0
  667. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
  668. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llama_classification.py +0 -0
  669. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llama_eagle.py +0 -0
  670. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llama_embedding.py +0 -0
  671. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llama_reward.py +0 -0
  672. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/llavavid.py +0 -0
  673. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/mimo.py +0 -0
  674. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/mimo_mtp.py +0 -0
  675. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/minicpm.py +0 -0
  676. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/minicpm3.py +0 -0
  677. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/minicpmv.py +0 -0
  678. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/mixtral.py +0 -0
  679. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/mixtral_quant.py +0 -0
  680. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/mllama.py +0 -0
  681. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/olmo.py +0 -0
  682. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/olmo2.py +0 -0
  683. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/olmoe.py +0 -0
  684. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/phi3_small.py +0 -0
  685. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen.py +0 -0
  686. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen2_classification.py +0 -0
  687. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen2_eagle.py +0 -0
  688. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/qwen2_rm.py +0 -0
  689. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/registry.py +0 -0
  690. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/roberta.py +0 -0
  691. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/siglip.py +0 -0
  692. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/stablelm.py +0 -0
  693. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/torch_native_llama.py +0 -0
  694. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/transformers.py +0 -0
  695. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/xverse.py +0 -0
  696. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/xverse_moe.py +0 -0
  697. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/models/yivl.py +0 -0
  698. {sglang-0.4.8.post1/sglang/srt → sglang-0.4.9/sglang/srt/multimodal}/mm_utils.py +0 -0
  699. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/operations.py +0 -0
  700. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/patch_torch.py +0 -0
  701. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  702. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  703. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  704. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  705. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  706. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  707. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/sampling/sampling_params.py +0 -0
  708. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  709. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +0 -0
  710. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/speculative/eagle_utils.py +0 -0
  711. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/speculative/spec_info.py +0 -0
  712. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  713. {sglang-0.4.8.post1/sglang/test/attention → sglang-0.4.9/sglang/test}/__init__.py +0 -0
  714. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/attention/test_flashattn_backend.py +0 -0
  715. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  716. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  717. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/few_shot_gsm8k.py +0 -0
  718. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  719. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/run_eval.py +0 -0
  720. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/send_one.py +0 -0
  721. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/simple_eval_common.py +0 -0
  722. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/simple_eval_gpqa.py +0 -0
  723. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/simple_eval_humaneval.py +0 -0
  724. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/simple_eval_math.py +0 -0
  725. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/simple_eval_mgsm.py +0 -0
  726. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/simple_eval_mmlu.py +0 -0
  727. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_activation.py +0 -0
  728. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_block_fp8.py +0 -0
  729. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
  730. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_block_fp8_ep.py +0 -0
  731. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_custom_ops.py +0 -0
  732. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_cutlass_moe.py +0 -0
  733. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_deepep_utils.py +0 -0
  734. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_dynamic_grad_mode.py +0 -0
  735. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_fp4_moe.py +0 -0
  736. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_layernorm.py +0 -0
  737. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/test/test_programs.py +0 -0
  738. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang/utils.py +0 -0
  739. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang.egg-info/dependency_links.txt +0 -0
  740. {sglang-0.4.8.post1 → sglang-0.4.9}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.8.post1
3
+ Version: 0.4.9
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -219,6 +219,7 @@ Requires-Dist: IPython
219
219
  Requires-Dist: setproctitle
220
220
  Provides-Extra: runtime-common
221
221
  Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
222
+ Requires-Dist: build; extra == "runtime-common"
222
223
  Requires-Dist: compressed-tensors; extra == "runtime-common"
223
224
  Requires-Dist: datasets; extra == "runtime-common"
224
225
  Requires-Dist: fastapi; extra == "runtime-common"
@@ -243,19 +244,20 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
243
244
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
244
245
  Requires-Dist: scipy; extra == "runtime-common"
245
246
  Requires-Dist: torchao==0.9.0; extra == "runtime-common"
246
- Requires-Dist: transformers==4.52.3; extra == "runtime-common"
247
+ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
248
+ Requires-Dist: timm==1.0.16; extra == "runtime-common"
247
249
  Requires-Dist: uvicorn; extra == "runtime-common"
248
250
  Requires-Dist: uvloop; extra == "runtime-common"
249
251
  Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
250
252
  Provides-Extra: srt
251
253
  Requires-Dist: sglang[runtime_common]; extra == "srt"
252
- Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
254
+ Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
253
255
  Requires-Dist: torch==2.7.1; extra == "srt"
254
256
  Requires-Dist: torchaudio==2.7.1; extra == "srt"
255
257
  Requires-Dist: torchvision==0.22.1; extra == "srt"
256
258
  Requires-Dist: cuda-python; extra == "srt"
257
259
  Requires-Dist: einops; extra == "srt"
258
- Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
260
+ Requires-Dist: flashinfer_python==0.2.7.post1; extra == "srt"
259
261
  Provides-Extra: blackwell
260
262
  Requires-Dist: sglang[runtime_common]; extra == "blackwell"
261
263
  Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -264,7 +266,7 @@ Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
264
266
  Requires-Dist: torchvision==0.22.1; extra == "blackwell"
265
267
  Requires-Dist: cuda-python; extra == "blackwell"
266
268
  Requires-Dist: einops; extra == "blackwell"
267
- Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
269
+ Requires-Dist: flashinfer_python==0.2.7.post1; extra == "blackwell"
268
270
  Provides-Extra: srt-hip
269
271
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
270
272
  Requires-Dist: torch; extra == "srt-hip"
@@ -295,7 +297,6 @@ Requires-Dist: jsonlines; extra == "test"
295
297
  Requires-Dist: matplotlib; extra == "test"
296
298
  Requires-Dist: pandas; extra == "test"
297
299
  Requires-Dist: peft; extra == "test"
298
- Requires-Dist: timm; extra == "test"
299
300
  Requires-Dist: sentence_transformers; extra == "test"
300
301
  Provides-Extra: all
301
302
  Requires-Dist: sglang[srt]; extra == "all"
@@ -373,6 +374,8 @@ Dynamic: license-file
373
374
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
374
375
 
375
376
  ## News
377
+ - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
378
+ - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
376
379
  - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
377
380
  - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
378
381
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
@@ -20,6 +20,8 @@
20
20
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
21
21
 
22
22
  ## News
23
+ - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
24
+ - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
23
25
  - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
24
26
  - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
25
27
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.8.post1"
7
+ version = "0.4.9"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -18,6 +18,7 @@ dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle
18
18
  [project.optional-dependencies]
19
19
  runtime_common = [
20
20
  "blobfile==3.0.0",
21
+ "build",
21
22
  "compressed-tensors",
22
23
  "datasets",
23
24
  "fastapi",
@@ -42,7 +43,8 @@ runtime_common = [
42
43
  "soundfile==0.13.1",
43
44
  "scipy",
44
45
  "torchao==0.9.0",
45
- "transformers==4.52.3",
46
+ "transformers==4.53.0",
47
+ "timm==1.0.16",
46
48
  "uvicorn",
47
49
  "uvloop",
48
50
  "xgrammar==0.1.19",
@@ -50,13 +52,13 @@ runtime_common = [
50
52
 
51
53
  srt = [
52
54
  "sglang[runtime_common]",
53
- "sgl-kernel==0.1.9",
55
+ "sgl-kernel==0.2.4",
54
56
  "torch==2.7.1",
55
57
  "torchaudio==2.7.1",
56
58
  "torchvision==0.22.1",
57
59
  "cuda-python",
58
60
  "einops",
59
- "flashinfer_python==0.2.6.post1",
61
+ "flashinfer_python==0.2.7.post1",
60
62
  ]
61
63
 
62
64
  blackwell = [
@@ -67,7 +69,7 @@ blackwell = [
67
69
  "torchvision==0.22.1",
68
70
  "cuda-python",
69
71
  "einops",
70
- "flashinfer_python==0.2.6.post1",
72
+ "flashinfer_python==0.2.7.post1",
71
73
  ]
72
74
 
73
75
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -86,9 +88,7 @@ srt_xpu = ["sglang[runtime_common]"]
86
88
  # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
87
89
  srt_hpu = ["sglang[runtime_common]"]
88
90
 
89
- # CPU: currently, there are no pre-built vllm wheels for CPU.
90
- # To install vllm for CPU, please follow the instruction here:
91
- # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
91
+ # CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu
92
92
  srt_cpu = ["sglang[runtime_common]", "einops"]
93
93
  # https://vllm-ascend.readthedocs.io/en/latest/installation.html
94
94
  srt_npu = ["sglang[runtime_common]"]
@@ -104,7 +104,6 @@ test = [
104
104
  "matplotlib",
105
105
  "pandas",
106
106
  "peft",
107
- "timm",
108
107
  "sentence_transformers",
109
108
  ]
110
109
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[torch_memory_saver]", "sglang[decord]"]
@@ -38,6 +38,7 @@ class BenchArgs:
38
38
  output_len: Tuple[int] = (16,)
39
39
  temperature: float = 0.0
40
40
  return_logprob: bool = False
41
+ client_stream_interval: int = 1
41
42
  input_len_step_percentage: float = 0.0
42
43
  result_filename: str = "result.jsonl"
43
44
  base_url: str = ""
@@ -60,6 +61,11 @@ class BenchArgs:
60
61
  )
61
62
  parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
62
63
  parser.add_argument("--return-logprob", action="store_true")
64
+ parser.add_argument(
65
+ "--client-stream-interval",
66
+ type=int,
67
+ default=BenchArgs.client_stream_interval,
68
+ )
63
69
  parser.add_argument(
64
70
  "--input-len-step-percentage",
65
71
  type=float,
@@ -120,6 +126,7 @@ def run_one_case(
120
126
  output_len: int,
121
127
  temperature: float,
122
128
  return_logprob: bool,
129
+ stream_interval: int,
123
130
  input_len_step_percentage: float,
124
131
  run_name: str,
125
132
  result_filename: str,
@@ -168,6 +175,7 @@ def run_one_case(
168
175
  "max_new_tokens": output_len,
169
176
  "ignore_eos": True,
170
177
  "json_schema": json_schema,
178
+ "stream_interval": stream_interval,
171
179
  },
172
180
  "return_logprob": return_logprob,
173
181
  "stream": True,
@@ -245,8 +253,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
245
253
  else:
246
254
  proc, base_url = launch_server_process(server_args)
247
255
 
248
- tokenizer_id = server_args.tokenizer_path or server_args.model_path
249
- tokenizer = get_tokenizer(tokenizer_id)
256
+ server_info = requests.get(base_url + "/get_server_info").json()
257
+ if "tokenizer_path" in server_info:
258
+ tokenizer_path = server_info["tokenizer_path"]
259
+ elif "prefill" in server_info:
260
+ tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
261
+ tokenizer = get_tokenizer(tokenizer_path)
250
262
 
251
263
  # warmup
252
264
  if not bench_args.skip_warmup:
@@ -258,6 +270,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
258
270
  output_len=16,
259
271
  temperature=bench_args.temperature,
260
272
  return_logprob=bench_args.return_logprob,
273
+ stream_interval=bench_args.client_stream_interval,
261
274
  input_len_step_percentage=bench_args.input_len_step_percentage,
262
275
  run_name="",
263
276
  result_filename="",
@@ -280,6 +293,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
280
293
  ol,
281
294
  temperature=bench_args.temperature,
282
295
  return_logprob=bench_args.return_logprob,
296
+ stream_interval=bench_args.client_stream_interval,
283
297
  input_len_step_percentage=bench_args.input_len_step_percentage,
284
298
  run_name=bench_args.run_name,
285
299
  result_filename=bench_args.result_filename,
@@ -301,6 +315,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
301
315
  ol,
302
316
  temperature=bench_args.temperature,
303
317
  return_logprob=bench_args.return_logprob,
318
+ stream_interval=bench_args.client_stream_interval,
304
319
  input_len_step_percentage=bench_args.input_len_step_percentage,
305
320
  run_name=bench_args.run_name,
306
321
  result_filename=bench_args.result_filename,
@@ -265,6 +265,138 @@ async def async_request_openai_completions(
265
265
  return output
266
266
 
267
267
 
268
+ async def async_request_openai_chat_completions(
269
+ request_func_input: RequestFuncInput,
270
+ pbar: Optional[tqdm] = None,
271
+ ) -> RequestFuncOutput:
272
+ """Makes a request to the OpenAI Chat Completions API.
273
+
274
+ Handles both streaming and non-streaming responses, including support
275
+ for image data in messages. Calculates and returns various performance
276
+ metrics.
277
+
278
+ Args:
279
+ request_func_input: Input parameters for the request.
280
+ pbar: Optional tqdm progress bar to update.
281
+
282
+ Returns:
283
+ RequestFuncOutput: Output of the request, including generated text,
284
+ latency, TTFT, ITL, and success status.
285
+ """
286
+ api_url = request_func_input.api_url
287
+ assert api_url.endswith(
288
+ "chat/completions"
289
+ ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
290
+
291
+ if request_func_input.image_data:
292
+ messages = [
293
+ {
294
+ "role": "user",
295
+ "content": [
296
+ {
297
+ "type": "image_url",
298
+ "image_url": {"url": request_func_input.image_data},
299
+ },
300
+ {"type": "text", "text": request_func_input.prompt},
301
+ ],
302
+ },
303
+ ]
304
+ else:
305
+ messages = [{"role": "user", "content": request_func_input.prompt}]
306
+
307
+ async with _create_bench_client_session() as session:
308
+ payload = {
309
+ "model": request_func_input.model,
310
+ "messages": messages,
311
+ "temperature": 0.0,
312
+ "max_tokens": request_func_input.output_len,
313
+ "stream": not args.disable_stream,
314
+ **request_func_input.extra_request_body,
315
+ }
316
+ headers = get_auth_headers()
317
+
318
+ output = RequestFuncOutput.init_new(request_func_input)
319
+
320
+ generated_text = ""
321
+ output_len = request_func_input.output_len
322
+ ttft = 0.0
323
+ st = time.perf_counter()
324
+ most_recent_timestamp = st
325
+ try:
326
+ async with session.post(
327
+ url=api_url, json=payload, headers=headers
328
+ ) as response:
329
+ if response.status == 200:
330
+ if args.disable_stream:
331
+ # Non-streaming response
332
+ response_json = await response.json()
333
+ output.generated_text = response_json["choices"][0]["message"][
334
+ "content"
335
+ ]
336
+ output.success = True
337
+ output.latency = time.perf_counter() - st
338
+ output.ttft = (
339
+ output.latency
340
+ ) # For non-streaming, TTFT = total latency
341
+ output.output_len = response_json.get("usage", {}).get(
342
+ "completion_tokens", output_len
343
+ )
344
+ else:
345
+ # Streaming response
346
+ async for chunk_bytes in response.content:
347
+ chunk_bytes = chunk_bytes.strip()
348
+ if not chunk_bytes:
349
+ continue
350
+
351
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
352
+ latency = time.perf_counter() - st
353
+ if chunk == "[DONE]":
354
+ pass
355
+ else:
356
+ data = json.loads(chunk)
357
+
358
+ # Check if this chunk contains content
359
+ delta = data.get("choices", [{}])[0].get("delta", {})
360
+ content = delta.get("content", "")
361
+
362
+ if content:
363
+ timestamp = time.perf_counter()
364
+ # First token
365
+ if ttft == 0.0:
366
+ ttft = timestamp - st
367
+ output.ttft = ttft
368
+
369
+ # Decoding phase
370
+ else:
371
+ output.itl.append(
372
+ timestamp - most_recent_timestamp
373
+ )
374
+
375
+ most_recent_timestamp = timestamp
376
+ generated_text += content
377
+
378
+ # Check for usage info in final chunk
379
+ output_len = (data.get("usage") or {}).get(
380
+ "completion_tokens", output_len
381
+ )
382
+
383
+ output.generated_text = generated_text
384
+ output.success = True
385
+ output.latency = latency
386
+ output.output_len = output_len
387
+ else:
388
+ output.error = response.reason or ""
389
+ output.success = False
390
+ except Exception:
391
+ output.success = False
392
+ exc_info = sys.exc_info()
393
+ output.error = "".join(traceback.format_exception(*exc_info))
394
+
395
+ if pbar:
396
+ pbar.update(1)
397
+ return output
398
+
399
+
268
400
  async def async_request_truss(
269
401
  request_func_input: RequestFuncInput,
270
402
  pbar: Optional[tqdm] = None,
@@ -544,6 +676,7 @@ def get_dataset(args, tokenizer):
544
676
  num_requests=args.num_prompts,
545
677
  tokenizer=tokenizer,
546
678
  fixed_output_len=args.random_output_len,
679
+ apply_chat_template=args.apply_chat_template,
547
680
  random_sample=True,
548
681
  )
549
682
  else:
@@ -555,8 +688,11 @@ ASYNC_REQUEST_FUNCS = {
555
688
  "sglang": async_request_sglang_generate,
556
689
  "sglang-native": async_request_sglang_generate,
557
690
  "sglang-oai": async_request_openai_completions,
691
+ "sglang-oai-chat": async_request_openai_chat_completions,
558
692
  "vllm": async_request_openai_completions,
693
+ "vllm-chat": async_request_openai_chat_completions,
559
694
  "lmdeploy": async_request_openai_completions,
695
+ "lmdeploy-chat": async_request_openai_chat_completions,
560
696
  "trt": async_request_trt_llm,
561
697
  "gserver": async_request_gserver,
562
698
  "truss": async_request_truss,
@@ -661,6 +797,7 @@ def sample_mmmu_requests(
661
797
  num_requests: int,
662
798
  tokenizer: PreTrainedTokenizerBase,
663
799
  fixed_output_len: Optional[int] = None,
800
+ apply_chat_template: bool = True,
664
801
  random_sample: bool = True,
665
802
  ) -> List[DatasetRow]:
666
803
  """
@@ -670,6 +807,7 @@ def sample_mmmu_requests(
670
807
  num_requests: Number of requests to sample.
671
808
  tokenizer: Tokenizer to use for token counting.
672
809
  fixed_output_len: If provided, use this fixed output length for all requests.
810
+ apply_chat_template: Whether to apply the chat template to the prompt.
673
811
  random_sample: Whether to randomly sample or take the first N.
674
812
 
675
813
  Returns:
@@ -739,28 +877,30 @@ def sample_mmmu_requests(
739
877
 
740
878
  # Construct the prompt
741
879
  prompt = f"Question: {question}\n\nAnswer: "
742
-
743
- try:
744
- prompt = tokenizer.apply_chat_template(
745
- [
746
- {
747
- "role": "user",
748
- "content": [
749
- {
750
- "type": "image_url",
751
- "image_url": {"url": image_data},
752
- },
753
- {"type": "text", "text": prompt},
754
- ],
755
- }
756
- ],
757
- add_generation_prompt=True,
758
- tokenize=False,
759
- )
760
- except Exception as e:
761
- # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
762
- print(f"Error applying chat template: {e}, fallback to <image> tag")
763
- prompt = f"<image>{prompt}"
880
+ if apply_chat_template:
881
+ try:
882
+ prompt = tokenizer.apply_chat_template(
883
+ [
884
+ {
885
+ "role": "user",
886
+ "content": [
887
+ {
888
+ "type": "image_url",
889
+ "image_url": {"url": image_data},
890
+ },
891
+ {"type": "text", "text": prompt},
892
+ ],
893
+ }
894
+ ],
895
+ add_generation_prompt=True,
896
+ tokenize=False,
897
+ )
898
+ except Exception as e:
899
+ # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
900
+ print(
901
+ f"Error applying chat template: {e}, fallback to <image> tag"
902
+ )
903
+ prompt = f"<image>{prompt}"
764
904
 
765
905
  # Calculate token lengths for text only (without image data)
766
906
  prompt_token_ids = tokenizer.encode(prompt)
@@ -1544,6 +1684,12 @@ def run_benchmark(args_: argparse.Namespace):
1544
1684
  if args.base_url
1545
1685
  else f"http://{args.host}:{args.port}/v1/completions"
1546
1686
  )
1687
+ elif args.backend in ["sglang-oai-chat", "vllm-chat", "lmdeploy-chat"]:
1688
+ api_url = (
1689
+ f"{args.base_url}/v1/chat/completions"
1690
+ if args.base_url
1691
+ else f"http://{args.host}:{args.port}/v1/chat/completions"
1692
+ )
1547
1693
  elif args.backend == "trt":
1548
1694
  api_url = (
1549
1695
  f"{args.base_url}/v2/models/ensemble/generate_stream"
@@ -147,12 +147,14 @@ class InternLM2Config(PretrainedConfig):
147
147
  )
148
148
  if (
149
149
  rope_scaling_factor is None
150
- or not isinstance(rope_scaling_factor, float)
150
+ or not isinstance(rope_scaling_factor, (float, int))
151
151
  or rope_scaling_factor < 1.0
152
152
  ):
153
153
  raise ValueError(
154
- f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}"
154
+ f"`rope_scaling`'s factor field must be a float|int >= 1, got {rope_scaling_factor=}, {type(rope_scaling_factor)=}"
155
155
  )
156
+ if isinstance(rope_scaling_factor, int):
157
+ rope_scaling_factor = float(rope_scaling_factor)
156
158
 
157
159
 
158
160
  class InternVisionConfig(PretrainedConfig):
@@ -19,7 +19,7 @@ from transformers import (
19
19
  from transformers.image_utils import to_numpy_array
20
20
 
21
21
  from sglang.srt.configs.utils import register_image_processor, register_processor
22
- from sglang.srt.mm_utils import expand2square
22
+ from sglang.srt.multimodal.mm_utils import expand2square
23
23
 
24
24
 
25
25
  class DictToObject(dict):
@@ -59,6 +59,7 @@ class ModelConfig:
59
59
  quantization: Optional[str] = None,
60
60
  override_config_file: Optional[str] = None,
61
61
  is_draft_model: bool = False,
62
+ hybrid_kvcache_ratio: Optional[float] = None,
62
63
  impl: Union[str, ModelImpl] = ModelImpl.AUTO,
63
64
  ) -> None:
64
65
 
@@ -86,6 +87,18 @@ class ModelConfig:
86
87
  self.attention_chunk_size = getattr(
87
88
  self.hf_text_config, "attention_chunk_size", None
88
89
  )
90
+ self.is_hybrid = is_hybrid_model(
91
+ self.hf_config.architectures,
92
+ hybrid_kvcache_ratio=hybrid_kvcache_ratio,
93
+ context_length=context_length,
94
+ attention_chunk_size=self.attention_chunk_size,
95
+ )
96
+ if self.is_hybrid is not None:
97
+ self.swa_attention_layer_ids, self.full_attention_layer_ids = (
98
+ get_hybrid_layer_ids(
99
+ self.hf_config.architectures, self.hf_text_config.num_hidden_layers
100
+ )
101
+ )
89
102
 
90
103
  if enable_multimodal is None:
91
104
  mm_disabled_models = [
@@ -264,6 +277,7 @@ class ModelConfig:
264
277
  enable_multimodal=server_args.enable_multimodal,
265
278
  dtype=server_args.dtype,
266
279
  quantization=server_args.quantization,
280
+ hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
267
281
  impl=server_args.impl,
268
282
  **kwargs,
269
283
  )
@@ -579,6 +593,7 @@ multimodal_model_archs = [
579
593
  "Mistral3ForConditionalGeneration",
580
594
  "MultiModalityCausalLM",
581
595
  "MllamaForConditionalGeneration",
596
+ "Qwen2AudioForConditionalGeneration",
582
597
  "Qwen2VLForConditionalGeneration",
583
598
  "Qwen2_5_VLForConditionalGeneration",
584
599
  "KimiVLForConditionalGeneration",
@@ -633,3 +648,36 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
633
648
  if scale <= 1:
634
649
  return 1.0
635
650
  return 0.1 * mscale * math.log(scale) + 1.0
651
+
652
+
653
+ def is_hybrid_model(
654
+ model_architectures: List[str],
655
+ hybrid_kvcache_ratio: Optional[float],
656
+ context_length: Optional[int],
657
+ attention_chunk_size: Optional[int],
658
+ ):
659
+ if hybrid_kvcache_ratio is None:
660
+ return None
661
+ elif (
662
+ hybrid_kvcache_ratio > 0
663
+ and model_architectures[0] == "Llama4ForConditionalGeneration"
664
+ and context_length > attention_chunk_size
665
+ ):
666
+ return hybrid_kvcache_ratio
667
+ else:
668
+ return None
669
+
670
+
671
+ def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int):
672
+ if "Llama4ForConditionalGeneration" in model_architectures:
673
+ swa_attention_layer_ids = [
674
+ i for i in range(num_hidden_layers) if (i + 1) % 4 != 0
675
+ ]
676
+ full_attention_layer_ids = [
677
+ i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
678
+ ]
679
+ else:
680
+ raise ValueError(
681
+ "get_hybrid_layer_ids is only implemented for Llama4ForConditionalGeneration"
682
+ )
683
+ return swa_attention_layer_ids, full_attention_layer_ids