sglang 0.4.5.tar.gz → 0.4.5.post2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
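For readers who want to reproduce or explore this comparison locally, here is a minimal sketch (assuming pip and git are on PATH, pypi.org is reachable, and the "dist" directory name is arbitrary) that downloads both source distributions and diffs the extracted trees with rename detection, similar to the {old → new} rename notation used in the listing below:

import pathlib
import subprocess
import tarfile

# Download both sdists from PyPI into ./dist (no wheels, no dependencies).
dist = pathlib.Path("dist")
for version in ("0.4.5", "0.4.5.post2"):
    subprocess.run(
        ["pip", "download", f"sglang=={version}",
         "--no-deps", "--no-binary", ":all:", "-d", str(dist)],
        check=True,
    )

# Extract each tarball; this yields dist/sglang-<version>/ directories.
for sdist in sorted(dist.glob("sglang-*.tar.gz")):
    with tarfile.open(sdist) as tar:
        tar.extractall(dist)

# Compare the two trees; git's -M flag detects renames, and --stat gives
# the per-file +/- counts seen below. (git diff exits non-zero when the
# trees differ, so check=True is deliberately omitted here.)
subprocess.run(
    ["git", "diff", "--no-index", "--stat", "-M",
     str(dist / "sglang-0.4.5"), str(dist / "sglang-0.4.5.post2")],
)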
Files changed (618)
  1. {sglang-0.4.5/sglang.egg-info → sglang-0.4.5.post2}/PKG-INFO +15 -5
  2. {sglang-0.4.5 → sglang-0.4.5.post2}/README.md +2 -2
  3. {sglang-0.4.5 → sglang-0.4.5.post2}/pyproject.toml +15 -3
  4. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/__init__.py +2 -4
  5. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_one_batch.py +23 -2
  6. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_serving.py +6 -4
  7. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/anthropic.py +0 -4
  8. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/base_backend.py +1 -1
  9. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/openai.py +1 -1
  10. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/vertexai.py +0 -1
  11. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/compiler.py +1 -7
  12. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/tracer.py +3 -7
  13. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/_custom_ops.py +0 -2
  14. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/model_config.py +37 -5
  15. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/base_grammar_backend.py +26 -5
  16. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/llguidance_backend.py +1 -0
  17. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_backend.py +1 -0
  18. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_jump_forward.py +14 -1
  19. sglang-0.4.5.post2/sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  20. sglang-0.4.5.post2/sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  21. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/xgrammar_backend.py +27 -4
  22. sglang-0.4.5.post2/sglang/srt/custom_op.py +44 -0
  23. sglang-0.4.5.post2/sglang/srt/disaggregation/base/__init__.py +8 -0
  24. sglang-0.4.5.post2/sglang/srt/disaggregation/base/conn.py +113 -0
  25. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/disaggregation/decode.py +80 -11
  26. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/disaggregation/mini_lb.py +58 -123
  27. sglang-0.4.5.post2/sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  28. sglang-0.4.5.post2/sglang/srt/disaggregation/mooncake/conn.py +585 -0
  29. sglang-0.4.5.post2/sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
  30. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/disaggregation/prefill.py +82 -22
  31. sglang-0.4.5.post2/sglang/srt/disaggregation/utils.py +90 -0
  32. sglang-0.4.5.post2/sglang/srt/entrypoints/EngineBase.py +53 -0
  33. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/entrypoints/engine.py +36 -8
  34. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/entrypoints/http_server.py +37 -8
  35. sglang-0.4.5.post2/sglang/srt/entrypoints/http_server_engine.py +142 -0
  36. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/entrypoints/verl_engine.py +42 -13
  37. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/hf_transformers_utils.py +4 -0
  38. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/activation.py +6 -8
  39. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashattention_backend.py +430 -257
  40. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashinfer_backend.py +18 -9
  41. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/torch_native_backend.py +6 -1
  42. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_backend.py +6 -0
  43. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
  44. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/vision.py +1 -1
  45. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/dp_attention.py +2 -4
  46. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/elementwise.py +15 -2
  47. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/linear.py +18 -3
  48. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/ep_moe/layer.py +15 -29
  49. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  50. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_native.py +4 -0
  51. sglang-0.4.5.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang-0.4.5.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  53. sglang-0.4.5.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang-0.4.5.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  55. sglang-0.4.5.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  56. sglang-0.4.5.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  57. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -34
  58. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  59. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/router.py +7 -1
  60. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/topk.py +63 -45
  61. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/parameter.py +0 -2
  62. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/__init__.py +13 -5
  63. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  64. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +12 -2
  65. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -77
  66. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  67. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/fp8.py +131 -136
  68. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/fp8_kernel.py +328 -46
  69. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/fp8_utils.py +206 -253
  70. sglang-0.4.5.post2/sglang/srt/layers/quantization/kv_cache.py +89 -0
  71. sglang-0.4.5.post2/sglang/srt/layers/quantization/modelopt_quant.py +463 -0
  72. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/moe_wna16.py +2 -0
  73. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/utils.py +5 -11
  74. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/w8a8_fp8.py +156 -4
  75. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/w8a8_int8.py +8 -7
  76. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/radix_attention.py +28 -1
  77. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/rotary_embedding.py +15 -3
  78. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/sampler.py +5 -10
  79. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/backend/base_backend.py +18 -2
  80. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  81. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/backend/triton_backend.py +1 -1
  82. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/layers.py +1 -1
  83. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/lora.py +1 -1
  84. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/lora_manager.py +1 -1
  85. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/detokenizer_manager.py +0 -1
  86. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/io_struct.py +255 -97
  87. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/mm_utils.py +7 -5
  88. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processor.py +0 -2
  89. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/base_processor.py +117 -79
  90. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  91. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  92. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/schedule_batch.py +64 -25
  93. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/scheduler.py +80 -82
  94. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/tokenizer_manager.py +18 -3
  95. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/tp_worker.py +1 -0
  96. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mem_cache/hiradix_cache.py +5 -1
  97. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mem_cache/memory_pool.py +21 -3
  98. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/metrics/collector.py +9 -0
  99. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_executor/cuda_graph_runner.py +9 -6
  100. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_executor/forward_batch_info.py +234 -15
  101. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_executor/model_runner.py +67 -35
  102. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_loader/loader.py +31 -4
  103. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_loader/weight_utils.py +4 -2
  104. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/baichuan.py +2 -0
  105. sglang-0.4.5.post2/sglang/srt/models/bert.py +398 -0
  106. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/chatglm.py +1 -0
  107. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/commandr.py +1 -0
  108. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/dbrx.py +1 -0
  109. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/deepseek.py +2 -1
  110. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_nextn.py +74 -70
  111. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_v2.py +494 -366
  112. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/exaone.py +1 -0
  113. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/gemma.py +1 -0
  114. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/gemma2.py +1 -0
  115. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/gemma3_causal.py +1 -0
  116. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/gpt2.py +1 -0
  117. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/gpt_bigcode.py +1 -0
  118. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/granite.py +1 -0
  119. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/grok.py +1 -0
  120. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/internlm2.py +1 -0
  121. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llama.py +6 -5
  122. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llama4.py +101 -34
  123. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/minicpm.py +1 -0
  124. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/minicpm3.py +30 -200
  125. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/mixtral.py +1 -0
  126. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/mixtral_quant.py +1 -0
  127. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/mllama.py +51 -8
  128. sglang-0.4.5.post2/sglang/srt/models/mllama4.py +227 -0
  129. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/olmo.py +1 -0
  130. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/olmo2.py +1 -0
  131. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/olmoe.py +1 -0
  132. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/phi3_small.py +1 -0
  133. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen.py +1 -0
  134. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen2.py +5 -1
  135. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_5_vl.py +35 -70
  136. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_moe.py +15 -13
  137. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_vl.py +27 -25
  138. sglang-0.4.5.post2/sglang/srt/models/qwen3.py +335 -0
  139. sglang-0.4.5.post2/sglang/srt/models/qwen3_moe.py +423 -0
  140. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/stablelm.py +1 -0
  141. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/xverse.py +1 -0
  142. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/xverse_moe.py +1 -0
  143. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/openai_api/adapter.py +4 -1
  144. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/patch_torch.py +11 -0
  145. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/reasoning_parser.py +0 -1
  146. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/sampling_batch_info.py +2 -3
  147. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/server_args.py +55 -19
  148. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  149. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/speculative/eagle_utils.py +1 -11
  150. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/speculative/eagle_worker.py +10 -9
  151. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/utils.py +136 -10
  152. sglang-0.4.5.post2/sglang/test/attention/test_flashattn_backend.py +350 -0
  153. sglang-0.4.5.post2/sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  154. sglang-0.4.5.post2/sglang/test/attention/test_prefix_chunk_info.py +224 -0
  155. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/runners.py +5 -1
  156. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_block_fp8.py +224 -0
  157. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_custom_ops.py +1 -1
  158. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_utils.py +19 -8
  159. sglang-0.4.5.post2/sglang/version.py +1 -0
  160. {sglang-0.4.5 → sglang-0.4.5.post2/sglang.egg-info}/PKG-INFO +15 -5
  161. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang.egg-info/SOURCES.txt +21 -6
  162. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang.egg-info/requires.txt +13 -2
  163. sglang-0.4.5/sglang/srt/custom_op.py +0 -106
  164. sglang-0.4.5/sglang/srt/disaggregation/conn.py +0 -81
  165. sglang-0.4.5/sglang/srt/disaggregation/utils.py +0 -44
  166. sglang-0.4.5/sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -146
  167. sglang-0.4.5/sglang/srt/layers/quantization/kv_cache.py +0 -98
  168. sglang-0.4.5/sglang/srt/layers/quantization/modelopt_quant.py +0 -196
  169. sglang-0.4.5/sglang/srt/lora/backend/__init__.py +0 -25
  170. sglang-0.4.5/sglang/srt/models/mllama4.py +0 -154
  171. sglang-0.4.5/sglang/srt/server.py +0 -18
  172. sglang-0.4.5/sglang/test/attention/__init__.py +0 -0
  173. sglang-0.4.5/sglang/test/attention/test_flashattn_backend.py +0 -312
  174. sglang-0.4.5/sglang/version.py +0 -1
  175. {sglang-0.4.5 → sglang-0.4.5.post2}/LICENSE +0 -0
  176. {sglang-0.4.5 → sglang-0.4.5.post2}/setup.cfg +0 -0
  177. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/api.py +0 -0
  178. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_offline_throughput.py +0 -0
  179. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_one_batch_server.py +0 -0
  180. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/check_env.py +0 -0
  181. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/global_config.py +0 -0
  182. {sglang-0.4.5/sglang/lang → sglang-0.4.5.post2/sglang/lang/backend}/__init__.py +0 -0
  183. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/litellm.py +0 -0
  184. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  185. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/chat_template.py +0 -0
  186. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/choices.py +0 -0
  187. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/interpreter.py +0 -0
  188. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/ir.py +0 -0
  189. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/launch_server.py +0 -0
  190. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/llama3_eval.py +0 -0
  191. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/aio_rwlock.py +0 -0
  192. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/code_completion_parser.py +0 -0
  193. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/__init__.py +0 -0
  194. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/chatglm.py +0 -0
  195. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/dbrx.py +0 -0
  196. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/deepseekvl2.py +0 -0
  197. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/device_config.py +0 -0
  198. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/exaone.py +0 -0
  199. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/janus_pro.py +0 -0
  200. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/load_config.py +0 -0
  201. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/utils.py +0 -0
  202. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/__init__.py +0 -0
  203. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/base_connector.py +0 -0
  204. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/redis.py +0 -0
  205. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/s3.py +0 -0
  206. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/serde/__init__.py +0 -0
  207. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/serde/safe_serde.py +0 -0
  208. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/serde/serde.py +0 -0
  209. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/connector/utils.py +0 -0
  210. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/conversation.py +0 -0
  211. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/__init__.py +0 -0
  212. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/communication_op.py +0 -0
  213. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  214. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  215. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  216. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  217. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  218. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  219. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  220. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  221. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/parallel_state.py +0 -0
  222. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/distributed/utils.py +0 -0
  223. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/function_call_parser.py +0 -0
  224. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  225. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  226. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  227. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  228. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  229. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  230. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  231. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  232. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/attention/utils.py +0 -0
  233. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/layernorm.py +1 -1
  234. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/logits_processor.py +0 -0
  235. {sglang-0.4.5/sglang/lang/backend → sglang-0.4.5.post2/sglang/srt/layers/moe/ep_moe}/__init__.py +0 -0
  236. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  237. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  238. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  239. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  240. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  241. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  242. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  243. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  244. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  245. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  246. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  247. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  248. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  249. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  250. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  251. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  252. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  253. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  254. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  255. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  256. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  257. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  258. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  259. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  260. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  261. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  262. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  263. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  264. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  265. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  266. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  267. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  268. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  269. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  270. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  271. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  272. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  273. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  274. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  275. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  276. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  277. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  278. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  279. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  280. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  281. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  282. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  283. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  284. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  285. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  286. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  287. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  288. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  289. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  290. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  291. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  292. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  293. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  294. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  295. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  296. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  297. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  298. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  299. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  300. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  301. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  302. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  303. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  304. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  305. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  306. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  307. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  308. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  309. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  310. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  311. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  312. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  313. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  314. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  315. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  316. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  317. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  318. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  319. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  320. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  321. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  322. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  323. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  324. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  325. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  326. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  327. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  328. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  329. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  330. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  331. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  332. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  333. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  334. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  335. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  336. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  337. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  338. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  339. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  340. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  341. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  342. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  343. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  344. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  345. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  346. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  347. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  348. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  349. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  350. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  351. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  352. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  353. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  354. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  355. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  356. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  357. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  358. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  359. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  360. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  361. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  362. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  363. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  364. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  365. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  366. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/pooler.py +0 -0
  367. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/awq.py +0 -0
  368. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  369. {sglang-0.4.5/sglang/srt/layers/moe/ep_moe → sglang-0.4.5.post2/sglang/srt/layers/quantization/compressed_tensors}/__init__.py +0 -0
  370. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  371. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  372. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  373. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  501. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  502. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  503. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  504. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  505. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  506. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  507. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  508. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  509. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  510. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  511. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  512. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  513. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  514. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  515. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  516. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  517. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  518. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  519. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  520. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  521. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  522. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  523. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  524. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  525. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/gptq.py +0 -0
  526. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  527. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  528. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  529. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  530. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/lora_config.py +0 -0
  531. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/mem_pool.py +0 -0
  532. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  533. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  534. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  535. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  536. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  537. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/lora/utils.py +0 -0
  538. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/cache_controller.py +0 -0
  539. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/configure_logging.py +0 -0
  540. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
  541. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/expert_distribution.py +0 -0
  542. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  543. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
  544. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
  545. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  546. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
  547. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  548. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
  549. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  550. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  551. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/session_controller.py +0 -0
  552. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  553. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/managers/utils.py +0 -0
  554. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  555. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  556. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  557. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  558. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  559. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/metrics/func_timer.py +0 -0
  560. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/mm_utils.py +0 -0
  561. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_loader/__init__.py +0 -0
  562. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_loader/utils.py +0 -0
  563. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/model_parallel.py +0 -0
  564. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/clip.py +0 -0
  565. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  566. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_vl2.py +0 -0
  567. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/gemma2_reward.py +0 -0
  568. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/gemma3_mm.py +0 -0
  569. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/internlm2_reward.py +0 -0
  570. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llama_classification.py +0 -0
  571. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llama_eagle.py +0 -0
  572. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llama_eagle3.py +0 -0
  573. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llama_embedding.py +0 -0
  574. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llama_reward.py +0 -0
  575. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llava.py +0 -0
  576. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/llavavid.py +0 -0
  577. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/minicpmo.py +0 -0
  578. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/minicpmv.py +0 -0
  579. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/mistral.py +0 -0
  580. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_classification.py +0 -0
  581. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
  582. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_rm.py +0 -0
  583. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/registry.py +0 -0
  584. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  585. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/models/yivl.py +0 -0
  586. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/openai_api/protocol.py +0 -0
  587. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/platforms/interface.py +0 -0
  588. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  589. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  590. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  591. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  592. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  593. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  594. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  595. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  596. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/speculative/spec_info.py +0 -0
  597. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  598. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/warmup.py +0 -0
  599. {sglang-0.4.5/sglang/srt/layers/quantization/compressed_tensors → sglang-0.4.5.post2/sglang/test}/__init__.py +0 -0
  600. {sglang-0.4.5/sglang/test → sglang-0.4.5.post2/sglang/test/attention}/__init__.py +0 -0
  601. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  602. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  603. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/run_eval.py +0 -0
  604. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/send_one.py +0 -0
  605. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/simple_eval_common.py +0 -0
  606. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  607. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  608. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/simple_eval_math.py +0 -0
  609. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  610. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  611. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_activation.py +0 -0
  612. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_block_fp8_ep.py +0 -0
  613. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_dynamic_grad_mode.py +0 -0
  614. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_layernorm.py +0 -0
  615. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/test/test_programs.py +0 -0
  616. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang/utils.py +0 -0
  617. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang.egg-info/dependency_links.txt +0 -0
  618. {sglang-0.4.5 → sglang-0.4.5.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.5/sglang.egg-info → sglang-0.4.5.post2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5
+Version: 0.4.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -239,20 +239,30 @@ Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.51.0; extra == "runtime-common"
+Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.8; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
+Requires-Dist: torchvision==0.20.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
+Provides-Extra: blackwell
+Requires-Dist: sglang[runtime_common]; extra == "blackwell"
+Requires-Dist: sgl-kernel; extra == "blackwell"
+Requires-Dist: torch; extra == "blackwell"
+Requires-Dist: torchvision; extra == "blackwell"
+Requires-Dist: cuda-python; extra == "blackwell"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
+Requires-Dist: partial_json_parser; extra == "blackwell"
+Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -371,7 +381,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -391,7 +401,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

{sglang-0.4.5 → sglang-0.4.5.post2}/README.md
@@ -43,7 +43,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -63,7 +63,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

{sglang-0.4.5 → sglang-0.4.5.post2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.4.5"
+version = "0.4.5.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -38,7 +38,7 @@ runtime_common = [
     "pyzmq>=25.1.2",
     "soundfile==0.13.1",
     "torchao>=0.7.0",
-    "transformers==4.51.0",
+    "transformers==4.51.1",
     "uvicorn",
     "uvloop",
     "compressed-tensors",
@@ -47,9 +47,21 @@ runtime_common = [

 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.8",
+    "sgl-kernel==0.0.9.post2",
     "flashinfer_python==0.2.3",
     "torch==2.5.1",
+    "torchvision==0.20.1",
+    "cuda-python",
+    "outlines>=0.0.44,<=0.1.11",
+    "partial_json_parser",
+    "einops",
+]
+
+blackwell = [
+    "sglang[runtime_common]",
+    "sgl-kernel",
+    "torch",
+    "torchvision",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "partial_json_parser",
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/__init__.py
@@ -24,6 +24,7 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.global_config import global_config
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
     unconditional_likelihood_normalized,
 )
 from sglang.utils import LazyImport
+from sglang.version import __version__

 ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
@@ -38,10 +40,6 @@ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
 VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")

-# Other configs
-from sglang.global_config import global_config
-from sglang.version import __version__
-
 __all__ = [
     "Engine",
     "Runtime",
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_one_batch.py
@@ -60,6 +60,7 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -135,6 +136,7 @@ def load_model(server_args, port_args, tp_rank):
         context_length=server_args.context_length,
         model_override_args=server_args.json_model_override_args,
         is_embedding=server_args.is_embedding,
+        enable_multimodal=server_args.enable_multimodal,
         dtype=server_args.dtype,
         quantization=server_args.quantization,
     )
@@ -184,6 +186,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
         req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
         reqs.append(req)

     return input_ids, reqs
@@ -199,11 +202,12 @@ def prepare_extend_inputs_for_correctness_test(
             i, : bench_args.cut_len
         ]
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
     return reqs


 def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
-    input_ids = np.ones((batch_size, input_len), dtype=np.int32)
+    input_ids = np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
     sampling_params = SamplingParams(
         temperature=0,
         max_new_tokens=BenchArgs.output_len,
@@ -220,6 +224,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
         req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
         reqs.append(req)

     return reqs
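
Two of the hunks above change benchmark behavior rather than just plumbing: every request now pins logprob_start_len to the final prompt token, so input logprobs are only computed where the correctness test reads them, and the synthetic latency prompts switch from all-ones to random token ids, presumably so that degenerate repeated-token prompts no longer flatter the prefill numbers. A minimal sketch of both settings (shapes and the vocab bound are illustrative):

    import numpy as np

    batch_size, input_len = 2, 8
    # Random ids in [0, 10000) replace np.ones(...), avoiding prompts made of
    # a single repeated token.
    input_ids = np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)

    origin_input_ids = input_ids[0].tolist()
    # Request logprobs starting at the last prompt token only.
    logprob_start_len = len(origin_input_ids) - 1
    assert logprob_start_len == input_len - 1
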
@@ -238,6 +243,7 @@ def extend(reqs, model_runner):
         enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
+    _maybe_prepare_dp_attn_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
@@ -249,6 +255,7 @@ def extend(reqs, model_runner):
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
+    _maybe_prepare_dp_attn_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
@@ -256,6 +263,20 @@ def decode(input_token_ids, batch, model_runner):
     return next_token_ids, logits_output.next_token_logits


+def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
+    if model_runner.server_args.enable_dp_attention:
+        Scheduler.prepare_dp_attn_batch_raw(
+            batch,
+            dp_size=model_runner.server_args.dp_size,
+            attn_tp_size=1,
+            tp_cpu_group=model_runner.tp_group.cpu_group,
+            get_idle_batch=None,
+            disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
+            spec_algorithm=SpeculativeAlgorithm.NONE,
+            speculative_num_draft_tokens=None,
+        )
+
+
 def correctness_test(
     server_args,
     port_args,
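
The new _maybe_prepare_dp_attn_batch reuses the scheduler's prepare_dp_attn_batch_raw so that bench_one_batch matches server behavior when data-parallel attention is on: all ranks must enter the forward pass together, even when some have no work. A conceptual sketch of that kind of coordination, under the assumption that ranks exchange batch sizes over a CPU process group (this is not sglang's actual implementation):

    import torch.distributed as dist

    def gather_batch_sizes(local_bs: int, cpu_group) -> list:
        # Every DP-attention rank learns the others' batch sizes, so idle or
        # lightly loaded ranks can build padded/idle batches and still step
        # through the same kernels.
        world = dist.get_world_size(group=cpu_group)
        sizes = [None] * world
        dist.all_gather_object(sizes, local_bs, group=cpu_group)
        return sizes
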
@@ -375,7 +396,7 @@ def latency_test_run_once(
         decode_latencies.append(latency)
         if i < 5:
             rank_print(
-                f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+                f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )

     if profile:
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_serving.py
@@ -490,7 +490,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name == "random":
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -498,6 +498,7 @@ def get_dataset(args, tokenizer):
             range_ratio=args.random_range_ratio,
             tokenizer=tokenizer,
             dataset_path=args.dataset_path,
+            random_sample=args.dataset_name == "random",
         )
     elif args.dataset_name == "generated-shared-prefix":
         input_requests = sample_generated_shared_prefix_requests(
@@ -687,6 +688,7 @@ def sample_random_requests(
     range_ratio: float,
     tokenizer: PreTrainedTokenizerBase,
     dataset_path: str,
+    random_sample: bool = True,
 ) -> List[Tuple[str, int, int]]:

     input_lens = np.random.randint(
@@ -700,7 +702,7 @@ def sample_random_requests(
         size=num_prompts,
     )

-    if True:
+    if random_sample:
         # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens

         # Download sharegpt if necessary
@@ -1223,7 +1225,7 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "random":
+        if args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
         else:
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1442,7 +1444,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "generated-shared-prefix"],
+        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
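
The bench_serving hunks above split random prompt generation into two modes behind one code path: "random" keeps the old behavior of splicing real ShareGPT token ids to the target lengths (random_sample=True), while the new "random-ids" choice draws token ids directly. A tiny sketch of the dispatch:

    def uses_sharegpt_tokens(dataset_name: str) -> bool:
        # Mirrors random_sample=args.dataset_name == "random" in get_dataset().
        assert dataset_name.startswith("random")
        return dataset_name == "random"

    assert uses_sharegpt_tokens("random") is True
    assert uses_sharegpt_tokens("random-ids") is False
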
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/anthropic.py
@@ -1,7 +1,3 @@
-from typing import List, Optional, Union
-
-import numpy as np
-
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/base_backend.py
@@ -1,4 +1,4 @@
-from typing import Callable, List, Optional, Union
+from typing import List, Optional, Union

 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/openai.py
@@ -2,7 +2,7 @@ import dataclasses
 import logging
 import time
 import warnings
-from typing import Callable, List, Optional, Union
+from typing import List, Optional, Union

 import numpy as np

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/vertexai.py
@@ -1,6 +1,5 @@
 import os
 import warnings
-from typing import Optional

 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/compiler.py
@@ -5,13 +5,7 @@ from typing import List, Union

 from sglang.global_config import global_config
 from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
-from sglang.lang.ir import (
-    SglArgument,
-    SglConstantText,
-    SglExpr,
-    SglSamplingParams,
-    SglVariable,
-)
+from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable


 def compile_func(function, backend):
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/tracer.py
@@ -1,20 +1,16 @@
 """Tracing a program."""

 import uuid
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional

-from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.interpreter import ProgramState, ProgramStateGroup
 from sglang.lang.ir import (
     SglArgument,
-    SglCommitLazy,
-    SglConcateAndAppend,
     SglConstantText,
     SglExpr,
     SglExprList,
     SglFork,
-    SglFunction,
     SglGen,
     SglGetForkItem,
     SglRoleBegin,
@@ -230,8 +226,8 @@ class TracerProgramState(ProgramState):
         self.cur_role = None

     def _execute_var_scope_end(self, expr: SglVarScopeEnd):
-        new_node = SglVariable(name, source=self.last_node)
-        self.variables[name] = new_node
+        new_node = SglVariable(expr.name, source=self.last_node)
+        self.variables[expr.name] = new_node

     def get_var(self, name):
         ret = self.arguments.get(name, None)
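
The two-line change in _execute_var_scope_end is a straight bug fix: the old body read a bare name, which is bound nowhere in that method and raised NameError as soon as a traced program closed a variable scope. A stripped-down reproduction of the corrected pattern (class and helper names here are illustrative, not sglang's):

    class ScopeEnd:
        def __init__(self, name):
            self.name = name

    def execute_var_scope_end(expr, variables, last_node=None):
        # Fixed form: the variable name is taken from the expression object.
        variables[expr.name] = ("variable", expr.name, last_node)

    variables = {}
    execute_var_scope_end(ScopeEnd("answer"), variables)
    assert "answer" in variables
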
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/_custom_ops.py
@@ -1,10 +1,8 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
 import logging
-import os
 from typing import List, Tuple

 import torch
-import torch.library

 from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/model_config.py
@@ -15,6 +15,7 @@
 import json
 import logging
 import math
+import os
 from enum import IntEnum, auto
 from typing import List, Optional, Set, Union

@@ -42,10 +43,12 @@ class ModelConfig:
         context_length: Optional[int] = None,
         model_override_args: Optional[str] = None,
         is_embedding: Optional[bool] = None,
+        enable_multimodal: Optional[bool] = None,
         dtype: str = "auto",
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
     ) -> None:
+
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
@@ -69,14 +72,28 @@ class ModelConfig:
             self.hf_text_config, "attention_chunk_size", None
         )

+        if enable_multimodal is None:
+            if self.hf_config.architectures == "Llama4ForConditionalGeneration":
+                enable_multimodal = False
+            else:
+                enable_multimodal = True
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
         )
-        self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
-        self.is_multimodal_gen = is_multimodal_gen_model(self.hf_config.architectures)
-        self.is_image_gen = is_image_gen_model(self.hf_config.architectures)
-        self.is_audio_model = is_audio_model(self.hf_config.architectures)
+        self.is_multimodal = enable_multimodal and is_multimodal_model(
+            self.hf_config.architectures
+        )
+        self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_image_gen = enable_multimodal and is_image_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_audio_model = enable_multimodal and is_audio_model(
+            self.hf_config.architectures
+        )
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

@@ -234,6 +251,20 @@ class ModelConfig:
         if quant_cfg is None:
             # compressed-tensors uses a "compression_config" key
             quant_cfg = getattr(self.hf_config, "compression_config", None)
+        if quant_cfg is None:
+            # check if is modelopt model -- modelopt doesn't have corresponding field
+            # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
+            # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
+            is_local = os.path.exists(self.model_path)
+            modelopt_quant_config = {"quant_method": "modelopt"}
+            if not is_local:
+                from huggingface_hub import HfApi
+
+                hf_api = HfApi()
+                if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
+                    quant_cfg = modelopt_quant_config
+            elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
+                quant_cfg = modelopt_quant_config
         return quant_cfg

     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -264,6 +295,7 @@ class ModelConfig:
             "moe_wna16",
         ]
         compatible_quantization_methods = {
+            "modelopt_fp4": ["modelopt"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
             "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
         }
@@ -470,8 +502,8 @@ multimodal_model_archs = [
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
-    # TODO: add multimodal support for "Llama4ForConditionalGeneration",
     "LlavaLlamaForCausalLM",
+    "Llama4ForConditionalGeneration",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
     "LlavaVidForCausalLM",
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/base_grammar_backend.py
@@ -28,6 +28,18 @@ logger = logging.getLogger(__name__)


 class BaseGrammarObject(ABC):
+
+    def __init__(self):
+        self._finished = False
+
+    @property
+    def finished(self):
+        return self._finished
+
+    @finished.setter
+    def finished(self, finished):
+        self._finished = finished
+
     @abstractmethod
     def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
         """
@@ -59,6 +71,13 @@ class BaseGrammarObject(ABC):
         """
         raise NotImplementedError

+    @abstractmethod
+    def accept_token(self, token: int) -> None:
+        """
+        Accept a token in the grammar.
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def allocate_vocab_mask(
         self, vocab_size: int, batch_size: int, device
@@ -90,7 +109,7 @@ class CacheEntry:
     event: Event


-class BaseGrammarBackend(ABC):
+class BaseGrammarBackend:
     def __init__(self):
         self.executor = ThreadPoolExecutor()
         self.cache: Dict[Tuple[str, str], CacheEntry] = {}
@@ -107,19 +126,15 @@ class BaseGrammarBackend(ABC):
         """
         raise ValueError(f"Invalid key_type: {key_type}={key_string}")

-    @abstractmethod
     def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("json", key_string)

-    @abstractmethod
     def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("regex", key_string)

-    @abstractmethod
     def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("ebnf", key_string)

-    @abstractmethod
     def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("structural_tag", key_string)

@@ -195,4 +210,10 @@ def create_grammar_backend(
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")

+    if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
+        from .reasoner_grammar_backend import ReasonerGrammarBackend
+
+        grammar_backend = ReasonerGrammarBackend(
+            grammar_backend, tokenizer.think_end_id
+        )
     return grammar_backend
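
The last hunk above wraps whichever backend was selected in a ReasonerGrammarBackend whenever a reasoning parser is configured and the tokenizer exposes think_end_id. The wrapper's code lives in the new reasoner_grammar_backend.py and is not shown in this diff; the sketch below is only a plausible reading of the idea, with illustrative names: leave the grammar dormant while the model is still inside its reasoning section, and start constraining tokens once the end-of-think token has been accepted.

    class ReasonerGrammarSketch:
        def __init__(self, inner_grammar, think_end_id: int):
            self.inner = inner_grammar
            self.think_end_id = think_end_id
            self.thinking = True

        def accept_token(self, token: int) -> None:
            if self.thinking:
                # No grammar constraints inside the reasoning section.
                self.thinking = token != self.think_end_id
            else:
                self.inner.accept_token(token)
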
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/llguidance_backend.py
@@ -33,6 +33,7 @@ class GuidanceGrammar(BaseGrammarObject):
     def __init__(
         self, llguidance_tokenizer: llguidance.LLTokenizer, serialized_grammar: str
     ):
+        super().__init__()
         self.llguidance_tokenizer = llguidance_tokenizer
         self.serialized_grammar = serialized_grammar

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_backend.py
@@ -44,6 +44,7 @@ class OutlinesGrammar(BaseGrammarObject):
         guide: RegexGuide,
         jump_forward_map: Union[OutlinesJumpForwardMap, None],
     ) -> None:
+        super().__init__()
         self.guide = guide
         self.jump_forward_map = jump_forward_map
         self.state = 0
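
Both one-line additions above exist for the same reason: BaseGrammarObject.__init__ now sets the _finished flag that backs the finished property, so a subclass constructor that skips super().__init__() would raise AttributeError on the first finished read. Minimal illustration:

    class Base:
        def __init__(self):
            self._finished = False

        @property
        def finished(self):
            return self._finished

    class Grammar(Base):
        def __init__(self):
            super().__init__()  # without this, g.finished raises AttributeError

    g = Grammar()
    assert g.finished is False
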
{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_jump_forward.py
@@ -19,10 +19,13 @@ Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
 import dataclasses
 import logging
 from collections import defaultdict
+from typing import Optional

 import interegular
 from interegular import InvalidSyntax
-from outlines.caching import cache as disk_cache
+from outlines.caching import cache
+
+from sglang.srt.utils import get_bool_env_var

 try:
     # outlines >= 0.1.0
@@ -34,6 +37,9 @@ except ImportError:

 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"

+# Env var was set in sglang.srt.server_args.ServerArgs.__post__init__
+DISABLE_DISK_CACHE = get_bool_env_var("SGLANG_DISABLE_OUTLINES_DISK_CACHE", "true")
+
 logger = logging.getLogger(__name__)


@@ -45,6 +51,13 @@ class JumpEdge:
     byte_next_state: int = None


+def disk_cache(expire: Optional[float] = None, typed=False, ignore=()):
+    if not DISABLE_DISK_CACHE:
+        return cache(expire, typed, ignore)
+    else:
+        return lambda fn: None
+
+
 @disk_cache()
 def init_state_to_jump_forward(regex_string):
     try: