sglang 0.4.1.post6__tar.gz → 0.4.2__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (367)
  1. {sglang-0.4.1.post6/sglang.egg-info → sglang-0.4.2}/PKG-INFO +21 -10
  2. {sglang-0.4.1.post6 → sglang-0.4.2}/README.md +6 -6
  3. {sglang-0.4.1.post6 → sglang-0.4.2}/pyproject.toml +9 -3
  4. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/__init__.py +21 -23
  5. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/api.py +2 -7
  6. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/bench_offline_throughput.py +41 -27
  7. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/bench_one_batch.py +60 -4
  8. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/bench_one_batch_server.py +1 -1
  9. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/bench_serving.py +83 -71
  10. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/backend/runtime_endpoint.py +183 -4
  11. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/chat_template.py +46 -4
  12. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/launch_server.py +1 -1
  13. sglang-0.4.2/sglang/srt/_custom_ops.py +156 -0
  14. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/device_config.py +1 -1
  15. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/load_config.py +1 -0
  16. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/model_config.py +1 -0
  17. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/constrained/base_grammar_backend.py +21 -0
  18. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/constrained/xgrammar_backend.py +8 -4
  19. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/conversation.py +14 -1
  20. sglang-0.4.2/sglang/srt/distributed/__init__.py +3 -0
  21. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/communication_op.py +2 -1
  22. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  23. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
  24. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  25. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  26. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  27. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  28. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  29. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  30. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/parallel_state.py +1 -1
  31. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/distributed/utils.py +2 -1
  32. sglang-0.4.2/sglang/srt/entrypoints/engine.py +452 -0
  33. sglang-0.4.2/sglang/srt/entrypoints/http_server.py +603 -0
  34. sglang-0.4.2/sglang/srt/function_call_parser.py +494 -0
  35. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/activation.py +8 -8
  36. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  37. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/triton_backend.py +4 -6
  38. sglang-0.4.2/sglang/srt/layers/attention/vision.py +204 -0
  39. sglang-0.4.2/sglang/srt/layers/dp_attention.py +71 -0
  40. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/layernorm.py +5 -5
  41. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/linear.py +65 -14
  42. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/logits_processor.py +49 -64
  43. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/ep_moe/layer.py +24 -16
  44. sglang-0.4.2/sglang/srt/layers/moe/fused_moe_native.py +129 -0
  45. sglang-0.4.2/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  46. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
  47. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
  48. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/parameter.py +18 -8
  49. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/__init__.py +20 -23
  50. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  51. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  52. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  53. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  54. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  55. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  56. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  57. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  58. sglang-0.4.2/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  59. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/fp8.py +10 -4
  60. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  61. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  62. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/radix_attention.py +2 -2
  63. sglang-0.4.2/sglang/srt/layers/rotary_embedding.py +1265 -0
  64. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/sampler.py +64 -6
  65. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/torchao_utils.py +12 -6
  66. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  67. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/lora/lora.py +1 -9
  68. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/configure_logging.py +3 -0
  69. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/data_parallel_controller.py +79 -72
  70. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/detokenizer_manager.py +24 -6
  71. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/image_processor.py +158 -2
  72. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/io_struct.py +57 -3
  73. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/schedule_batch.py +78 -45
  74. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/schedule_policy.py +26 -12
  75. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/scheduler.py +326 -201
  76. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/session_controller.py +1 -0
  77. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/tokenizer_manager.py +210 -121
  78. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/tp_worker.py +6 -4
  79. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  80. sglang-0.4.2/sglang/srt/managers/utils.py +44 -0
  81. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/mem_cache/memory_pool.py +10 -32
  82. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/metrics/collector.py +15 -6
  83. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_executor/cuda_graph_runner.py +26 -30
  84. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_executor/forward_batch_info.py +5 -7
  85. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_executor/model_runner.py +44 -19
  86. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_loader/loader.py +83 -6
  87. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_loader/weight_utils.py +145 -6
  88. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/baichuan.py +6 -6
  89. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/chatglm.py +2 -2
  90. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/commandr.py +17 -5
  91. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/dbrx.py +13 -5
  92. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/deepseek.py +3 -3
  93. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/deepseek_v2.py +11 -11
  94. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/exaone.py +2 -2
  95. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/gemma.py +2 -2
  96. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/gemma2.py +15 -25
  97. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/gpt2.py +3 -5
  98. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/gpt_bigcode.py +1 -1
  99. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/granite.py +2 -2
  100. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/grok.py +4 -3
  101. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/internlm2.py +2 -2
  102. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/llama.py +7 -5
  103. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/minicpm.py +2 -2
  104. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/minicpm3.py +9 -9
  105. sglang-0.4.2/sglang/srt/models/minicpmv.py +1238 -0
  106. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/mixtral.py +3 -3
  107. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/mixtral_quant.py +3 -3
  108. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/mllama.py +2 -2
  109. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/olmo.py +3 -3
  110. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/olmo2.py +4 -4
  111. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/olmoe.py +7 -13
  112. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/phi3_small.py +2 -2
  113. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/qwen.py +2 -2
  114. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/qwen2.py +41 -4
  115. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/qwen2_moe.py +3 -3
  116. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/qwen2_vl.py +22 -122
  117. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/stablelm.py +2 -2
  118. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/torch_native_llama.py +20 -7
  119. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/xverse.py +6 -6
  120. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/xverse_moe.py +6 -6
  121. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/openai_api/adapter.py +139 -37
  122. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/openai_api/protocol.py +7 -4
  123. sglang-0.4.2/sglang/srt/sampling/custom_logit_processor.py +38 -0
  124. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  125. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/sampling_batch_info.py +143 -18
  126. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/sampling_params.py +3 -1
  127. sglang-0.4.1.post6/sglang/srt/constrained/__init__.py → sglang-0.4.2/sglang/srt/server.py +4 -2
  128. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/server_args.py +77 -15
  129. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/speculative/eagle_utils.py +37 -15
  130. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/speculative/eagle_worker.py +11 -13
  131. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/utils.py +164 -129
  132. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/runners.py +8 -13
  133. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/test_programs.py +2 -1
  134. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/test_utils.py +83 -22
  135. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/utils.py +12 -2
  136. sglang-0.4.2/sglang/version.py +1 -0
  137. {sglang-0.4.1.post6 → sglang-0.4.2/sglang.egg-info}/PKG-INFO +21 -10
  138. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang.egg-info/SOURCES.txt +18 -3
  139. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang.egg-info/requires.txt +17 -3
  140. sglang-0.4.1.post6/sglang/launch_server_llavavid.py +0 -25
  141. sglang-0.4.1.post6/sglang/srt/_custom_ops.py +0 -118
  142. sglang-0.4.1.post6/sglang/srt/distributed/__init__.py +0 -3
  143. sglang-0.4.1.post6/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  144. sglang-0.4.1.post6/sglang/srt/layers/moe/fused_moe_native.py +0 -46
  145. sglang-0.4.1.post6/sglang/srt/layers/rotary_embedding.py +0 -112
  146. sglang-0.4.1.post6/sglang/srt/server.py +0 -1104
  147. sglang-0.4.1.post6/sglang/version.py +0 -1
  148. {sglang-0.4.1.post6 → sglang-0.4.2}/LICENSE +0 -0
  149. {sglang-0.4.1.post6 → sglang-0.4.2}/setup.cfg +0 -0
  150. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/bench_latency.py +0 -0
  151. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/check_env.py +0 -0
  152. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/global_config.py +0 -0
  153. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/__init__.py +0 -0
  154. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/backend/__init__.py +0 -0
  155. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/backend/anthropic.py +0 -0
  156. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/backend/base_backend.py +0 -0
  157. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/backend/litellm.py +0 -0
  158. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/backend/openai.py +0 -0
  159. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/backend/vertexai.py +0 -0
  160. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/choices.py +0 -0
  161. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/compiler.py +0 -0
  162. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/interpreter.py +0 -0
  163. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/ir.py +0 -0
  164. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/lang/tracer.py +0 -0
  165. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/llama3_eval.py +0 -0
  166. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/aio_rwlock.py +0 -0
  167. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/__init__.py +0 -0
  168. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/chatglm.py +0 -0
  169. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/dbrx.py +0 -0
  170. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/exaone.py +0 -0
  171. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/configs/qwen2vl.py +0 -0
  172. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/constrained/outlines_backend.py +0 -0
  173. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  174. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/hf_transformers_utils.py +0 -0
  175. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/__init__.py +0 -0
  176. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  177. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  178. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  179. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  180. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  181. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  182. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/custom_op_util.py +0 -0
  183. {sglang-0.4.1.post6/sglang/srt/distributed/device_communicators → sglang-0.4.2/sglang/srt/layers/moe/ep_moe}/__init__.py +0 -0
  184. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  185. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  186. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  187. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  188. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  189. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  190. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  191. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  192. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  193. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  194. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  195. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  196. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  197. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  198. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  199. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  200. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  201. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  202. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  203. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  204. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  205. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  206. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  207. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  208. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  209. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  210. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  211. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  212. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  213. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  214. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  215. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  216. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  217. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  218. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  219. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  220. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  221. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  222. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  223. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  224. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  225. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  226. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  227. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  228. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  229. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  230. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  231. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  232. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  233. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  234. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  235. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  236. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  237. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  238. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  239. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  240. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  241. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  242. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  243. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  244. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  245. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  246. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  247. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  248. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  249. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  250. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  251. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  252. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  253. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  254. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  255. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  256. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  257. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  258. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  259. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  260. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  261. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  262. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  263. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  264. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  265. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  266. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  267. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  268. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  269. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  270. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  271. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  272. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  273. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  274. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  275. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  276. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  277. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/moe/topk.py +0 -0
  278. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/pooler.py +0 -0
  279. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/base_config.py +0 -0
  280. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  281. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  282. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  283. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  284. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  285. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  286. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  287. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  288. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  289. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  290. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  291. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  292. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  293. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  294. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  295. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  296. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  297. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  298. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  299. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  300. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  301. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  302. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  303. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  304. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  305. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  306. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  307. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  308. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  309. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  310. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  311. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  312. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  313. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  314. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  315. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  316. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  317. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  318. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  319. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  320. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  321. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/lora/lora_config.py +0 -0
  322. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/lora/lora_manager.py +0 -0
  323. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/managers/cache_controller.py +0 -0
  324. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  325. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  326. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  327. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  328. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/metrics/func_timer.py +0 -0
  329. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/mm_utils.py +0 -0
  330. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_loader/__init__.py +0 -0
  331. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_loader/utils.py +0 -0
  332. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/model_parallel.py +0 -0
  333. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/gemma2_reward.py +0 -0
  334. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/internlm2_reward.py +0 -0
  335. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/llama_classification.py +0 -0
  336. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/llama_eagle.py +0 -0
  337. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/llama_embedding.py +0 -0
  338. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/llama_reward.py +0 -0
  339. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/llava.py +0 -0
  340. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/llavavid.py +0 -0
  341. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/mistral.py +0 -0
  342. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/qwen2_eagle.py +0 -0
  343. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/registry.py +0 -0
  344. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/models/yivl.py +0 -0
  345. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  346. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  347. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  348. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  349. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  350. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  351. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/speculative/spec_info.py +0 -0
  352. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  353. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/few_shot_gsm8k.py +0 -0
  354. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  355. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/run_eval.py +0 -0
  356. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/simple_eval_common.py +0 -0
  357. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/simple_eval_gpqa.py +0 -0
  358. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/simple_eval_humaneval.py +0 -0
  359. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/simple_eval_math.py +0 -0
  360. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/simple_eval_mgsm.py +0 -0
  361. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/simple_eval_mmlu.py +0 -0
  362. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  363. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/test_activation.py +0 -0
  364. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/test_block_fp8.py +0 -0
  365. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang/test/test_layernorm.py +0 -0
  366. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang.egg-info/dependency_links.txt +0 -0
  367. {sglang-0.4.1.post6 → sglang-0.4.2}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: sglang
3
- Version: 0.4.1.post6
3
+ Version: 0.4.2
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -236,13 +236,13 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
236
236
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
237
237
  Requires-Dist: uvicorn; extra == "runtime-common"
238
238
  Requires-Dist: uvloop; extra == "runtime-common"
239
- Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
239
+ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
240
240
  Provides-Extra: srt
241
241
  Requires-Dist: sglang[runtime_common]; extra == "srt"
242
242
  Requires-Dist: cuda-python; extra == "srt"
243
- Requires-Dist: sgl-kernel>=0.0.2.post12; extra == "srt"
243
+ Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
244
244
  Requires-Dist: torch; extra == "srt"
245
- Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
245
+ Requires-Dist: vllm==0.6.4.post1; extra == "srt"
246
246
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
247
247
  Provides-Extra: srt-hip
248
248
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -252,6 +252,9 @@ Provides-Extra: srt-xpu
252
252
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
253
253
  Provides-Extra: srt-hpu
254
254
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
255
+ Provides-Extra: srt-cpu
256
+ Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
257
+ Requires-Dist: torch; extra == "srt-cpu"
255
258
  Provides-Extra: openai
256
259
  Requires-Dist: openai>=1.0; extra == "openai"
257
260
  Requires-Dist: tiktoken; extra == "openai"
@@ -288,6 +291,11 @@ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
288
291
  Requires-Dist: sglang[openai]; extra == "all-hpu"
289
292
  Requires-Dist: sglang[anthropic]; extra == "all-hpu"
290
293
  Requires-Dist: sglang[litellm]; extra == "all-hpu"
294
+ Provides-Extra: all-cpu
295
+ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
296
+ Requires-Dist: sglang[openai]; extra == "all-cpu"
297
+ Requires-Dist: sglang[anthropic]; extra == "all-cpu"
298
+ Requires-Dist: sglang[litellm]; extra == "all-cpu"
291
299
  Provides-Extra: dev
292
300
  Requires-Dist: sglang[all]; extra == "dev"
293
301
  Requires-Dist: sglang[test]; extra == "dev"
@@ -300,6 +308,9 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
300
308
  Provides-Extra: dev-hpu
301
309
  Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
302
310
  Requires-Dist: sglang[test]; extra == "dev-hpu"
311
+ Provides-Extra: dev-cpu
312
+ Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
313
+ Requires-Dist: sglang[test]; extra == "dev-cpu"
303
314
 
304
315
  <div align="center" id="sglangtop">
305
316
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -322,16 +333,16 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
322
333
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
323
334
 
324
335
  ## News
325
- - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
326
- - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
327
- - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
328
- - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
336
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
337
+ - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
338
+ - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
339
+ - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
329
340
 
330
341
  <details>
331
342
  <summary>More</summary>
332
343
 
344
+ - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
333
345
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
334
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
335
346
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
336
347
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
337
348
 
@@ -361,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
361
372
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
362
373
 
363
374
  ## Adoption and Sponsorship
364
- The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
375
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
365
376
 
366
377
  ## Acknowledgment and Citation
367
378
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -19,16 +19,16 @@
19
19
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
20
20
 
21
21
  ## News
22
- - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
23
- - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
24
- - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
25
- - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
22
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
23
+ - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
24
+ - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
25
+ - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
26
26
 
27
27
  <details>
28
28
  <summary>More</summary>
29
29
 
30
+ - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
30
31
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
31
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
32
32
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
33
33
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
34
34
 
@@ -58,7 +58,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
58
58
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
59
59
 
60
60
  ## Adoption and Sponsorship
61
- The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
61
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
62
62
 
63
63
  ## Acknowledgment and Citation
64
64
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.1.post6"
7
+ version = "0.4.2"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -23,11 +23,11 @@ runtime_common = [
23
23
  "packaging", "pillow", "prometheus-client>=0.20.0",
24
24
  "psutil", "pydantic", "python-multipart",
25
25
  "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
26
- "xgrammar>=0.1.6"
26
+ "xgrammar>=0.1.10"
27
27
  ]
28
28
  srt = [
29
29
  "sglang[runtime_common]", "cuda-python",
30
- "sgl-kernel>=0.0.2.post12", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1",
30
+ "sgl-kernel>=0.0.3", "torch", "vllm==0.6.4.post1",
31
31
  "flashinfer==0.1.6"
32
32
  ]
33
33
 
@@ -40,6 +40,10 @@ srt_xpu = ["sglang[runtime_common]"]
40
40
  #For Intel Gaudi(device : hpu) follow the installation guide
41
41
  #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
42
42
  srt_hpu = ["sglang[runtime_common]"]
43
+ # CPU: currently, there are no pre-built vllm wheels for CPU.
44
+ # To install vllm for CPU, please follow the instruction here:
45
+ # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
46
+ srt_cpu = ["sglang[runtime_common]", "torch"]
43
47
 
44
48
  openai = ["openai>=1.0", "tiktoken"]
45
49
  anthropic = ["anthropic>=0.20.0"]
@@ -57,11 +61,13 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
57
61
  all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
58
62
  all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
59
63
  all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
64
+ all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
60
65
 
61
66
  dev = ["sglang[all]", "sglang[test]"]
62
67
  dev_hip = ["sglang[all_hip]", "sglang[test]"]
63
68
  dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
64
69
  dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
70
+ dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
65
71
 
66
72
  [project.urls]
67
73
  "Homepage" = "https://github.com/sgl-project/sglang"
@@ -1,5 +1,6 @@
1
- # SGL API Components
1
+ # SGLang public APIs
2
2
 
3
+ # Frontend Language APIs
3
4
  from sglang.api import (
4
5
  Engine,
5
6
  Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
23
24
  user_end,
24
25
  video,
25
26
  )
27
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
26
28
  from sglang.lang.choices import (
27
29
  greedy_token_selection,
28
30
  token_length_normalized,
29
31
  unconditional_likelihood_normalized,
30
32
  )
33
+ from sglang.utils import LazyImport
34
+
35
+ Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
36
+ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
37
+ OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
38
+ VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
39
+
40
+ # Other configs
41
+ from sglang.global_config import global_config
42
+ from sglang.version import __version__
31
43
 
32
- # SGLang DSL APIs
33
44
  __all__ = [
34
- "Runtime",
35
45
  "Engine",
46
+ "Runtime",
36
47
  "assistant",
37
48
  "assistant_begin",
38
49
  "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
52
63
  "user_begin",
53
64
  "user_end",
54
65
  "video",
66
+ "RuntimeEndpoint",
55
67
  "greedy_token_selection",
56
68
  "token_length_normalized",
57
69
  "unconditional_likelihood_normalized",
70
+ "Anthropic",
71
+ "LiteLLM",
72
+ "OpenAI",
73
+ "VertexAI",
74
+ "global_config",
75
+ "__version__",
58
76
  ]
59
-
60
- # Global Configurations
61
- from sglang.global_config import global_config
62
-
63
- __all__ += ["global_config"]
64
-
65
- from sglang.version import __version__
66
-
67
- __all__ += ["__version__"]
68
-
69
- # SGLang Backends
70
- from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
71
- from sglang.utils import LazyImport
72
-
73
- Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
74
- LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
75
- OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
76
- VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
77
-
78
- __all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
@@ -1,6 +1,5 @@
1
1
  """Public APIs of the language."""
2
2
 
3
- import os
4
3
  import re
5
4
  from typing import Callable, List, Optional, Union
6
5
 
@@ -33,19 +32,15 @@ def function(
33
32
 
34
33
 
35
34
  def Runtime(*args, **kwargs):
36
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
37
-
38
35
  # Avoid importing unnecessary dependency
39
- from sglang.srt.server import Runtime
36
+ from sglang.lang.backend.runtime_endpoint import Runtime
40
37
 
41
38
  return Runtime(*args, **kwargs)
42
39
 
43
40
 
44
41
  def Engine(*args, **kwargs):
45
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
46
-
47
42
  # Avoid importing unnecessary dependency
48
- from sglang.srt.server import Engine
43
+ from sglang.srt.entrypoints.engine import Engine
49
44
 
50
45
  return Engine(*args, **kwargs)
51
46
 
@@ -27,7 +27,8 @@ from sglang.bench_serving import (
27
27
  sample_random_requests,
28
28
  set_ulimit,
29
29
  )
30
- from sglang.srt.server import Engine, Runtime
30
+ from sglang.lang.backend.runtime_endpoint import Runtime
31
+ from sglang.srt.entrypoints.engine import Engine
31
32
  from sglang.srt.server_args import ServerArgs
32
33
 
33
34
 
@@ -39,20 +40,22 @@ class BenchArgs:
39
40
  dataset_path: str = ""
40
41
  num_prompts: int = 1000
41
42
  sharegpt_output_len: Optional[int] = None
43
+ sharegpt_context_len: Optional[int] = None
42
44
  random_input_len: int = 1024
43
45
  random_output_len: int = 1024
44
46
  random_range_ratio: float = 0.0
45
- gen_num_groups: int = 64
46
- gen_prompts_per_group: int = 16
47
- gen_system_prompt_len: int = 2048
48
- gen_question_len: int = 128
49
- gen_output_len: int = 256
47
+ gsp_num_groups: int = 64
48
+ gsp_prompts_per_group: int = 16
49
+ gsp_system_prompt_len: int = 2048
50
+ gsp_question_len: int = 128
51
+ gsp_output_len: int = 256
52
+ seed: int = 1
50
53
  disable_ignore_eos: bool = False
51
54
  extra_request_body: Optional[str] = None
52
- seed: int = 1
55
+ apply_chat_template: bool = False
56
+ profile: bool = False
53
57
  skip_warmup: bool = False
54
58
  do_not_exit: bool = False
55
- profile: bool = False
56
59
 
57
60
  @staticmethod
58
61
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -82,6 +85,12 @@ class BenchArgs:
82
85
  default=BenchArgs.sharegpt_output_len,
83
86
  help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
84
87
  )
88
+ parser.add_argument(
89
+ "--sharegpt-context-len",
90
+ type=int,
91
+ default=BenchArgs.sharegpt_context_len,
92
+ help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
93
+ )
85
94
  parser.add_argument(
86
95
  "--random-input-len",
87
96
  type=int,
@@ -102,51 +111,62 @@ class BenchArgs:
102
111
  "used only for random dataset.",
103
112
  )
104
113
  parser.add_argument(
105
- "--gen-num-groups",
114
+ "--gsp-num-groups",
106
115
  type=int,
107
- default=BenchArgs.gen_num_groups,
116
+ default=BenchArgs.gsp_num_groups,
108
117
  help="Number of groups with shared prefix, used"
109
118
  "only for generate-shared-prefix",
110
119
  )
111
120
  parser.add_argument(
112
- "--gen-prompts-per-group",
121
+ "--gsp-prompts-per-group",
113
122
  type=int,
114
- default=BenchArgs.gen_prompts_per_group,
123
+ default=BenchArgs.gsp_prompts_per_group,
115
124
  help="Number of prompts per group of shared prefix, used"
116
125
  "only for generate-shared-prefix",
117
126
  )
118
127
  parser.add_argument(
119
- "--gen-system-prompt-len",
128
+ "--gsp-system-prompt-len",
120
129
  type=int,
121
- default=BenchArgs.gen_system_prompt_len,
130
+ default=BenchArgs.gsp_system_prompt_len,
122
131
  help="System prompt length, used" "only for generate-shared-prefix",
123
132
  )
124
133
  parser.add_argument(
125
- "--gen-question-len",
134
+ "--gsp-question-len",
126
135
  type=int,
127
- default=BenchArgs.gen_question_len,
136
+ default=BenchArgs.gsp_question_len,
128
137
  help="Question length, used" "only for generate-shared-prefix",
129
138
  )
130
139
  parser.add_argument(
131
- "--gen-output-len",
140
+ "--gsp-output-len",
132
141
  type=int,
133
- default=BenchArgs.gen_output_len,
142
+ default=BenchArgs.gsp_output_len,
134
143
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
135
144
  )
145
+ parser.add_argument("--seed", type=int, default=1, help="The random seed.")
136
146
  parser.add_argument(
137
147
  "--disable-ignore-eos",
138
- type=bool,
139
- default=BenchArgs.disable_ignore_eos,
148
+ action="store_true",
140
149
  help="Disable ignore EOS token",
141
150
  )
142
151
  parser.add_argument(
143
152
  "--extra-request-body",
144
153
  metavar='{"key1": "value1", "key2": "value2"}',
145
154
  type=str,
155
+ default=BenchArgs.extra_request_body,
146
156
  help="Append given JSON object to the request payload. You can use this to specify"
147
157
  "additional generate params like sampling params.",
148
158
  )
149
- parser.add_argument("--seed", type=int, default=1, help="The random seed.")
159
+ parser.add_argument(
160
+ "--apply-chat-template",
161
+ action="store_true",
162
+ help="Apply chat template",
163
+ )
164
+ parser.add_argument(
165
+ "--profile",
166
+ action="store_true",
167
+ help="Use Torch Profiler. The endpoint must be launched with "
168
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
169
+ )
150
170
  parser.add_argument(
151
171
  "--skip-warmup",
152
172
  action="store_true",
@@ -157,12 +177,6 @@ class BenchArgs:
157
177
  action="store_true",
158
178
  help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
159
179
  )
160
- parser.add_argument(
161
- "--profile",
162
- action="store_true",
163
- help="Use Torch Profiler. The endpoint must be launched with "
164
- "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
165
- )
166
180
 
167
181
  @classmethod
168
182
  def from_cli_args(cls, args: argparse.Namespace):
@@ -9,7 +9,8 @@ It accepts server arguments (the same as launch_server.py) and benchmark argumen
9
9
  python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
10
10
  ## sweep through multiple data points and store (append) the results in a jsonl file:
11
11
  python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
12
-
12
+ ## run with profiling:
13
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
13
14
  # Usage (correctness test):
14
15
  python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
15
16
 
@@ -56,15 +57,21 @@ import torch
56
57
  import torch.distributed as dist
57
58
 
58
59
  from sglang.srt.configs.model_config import ModelConfig
60
+ from sglang.srt.entrypoints.engine import _set_envs_and_config
59
61
  from sglang.srt.hf_transformers_utils import get_tokenizer
60
62
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
61
63
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
62
64
  from sglang.srt.model_executor.model_runner import ModelRunner
63
65
  from sglang.srt.sampling.sampling_params import SamplingParams
64
- from sglang.srt.server import _set_envs_and_config
65
66
  from sglang.srt.server_args import PortArgs, ServerArgs
66
67
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
67
- from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
68
+ from sglang.srt.utils import (
69
+ configure_logger,
70
+ get_bool_env_var,
71
+ kill_process_tree,
72
+ set_gpu_proc_affinity,
73
+ suppress_other_loggers,
74
+ )
68
75
 
69
76
 
70
77
  @dataclasses.dataclass
@@ -77,6 +84,8 @@ class BenchArgs:
77
84
  correctness_test: bool = False
78
85
  # This is only used for correctness test
79
86
  cut_len: int = 4
87
+ profile: bool = False
88
+ profile_filename_prefix: str = "profile"
80
89
 
81
90
  @staticmethod
82
91
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -95,6 +104,16 @@ class BenchArgs:
95
104
  )
96
105
  parser.add_argument("--correctness-test", action="store_true")
97
106
  parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
107
+ parser.add_argument(
108
+ "--profile", action="store_true", help="Use Torch Profiler."
109
+ )
110
+ parser.add_argument(
111
+ "--profile-filename-prefix",
112
+ type=str,
113
+ default=BenchArgs.profile_filename_prefix,
114
+ help="Prefix of the profiling file names. The full profiling result file(s) be "
115
+ '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
116
+ )
98
117
 
99
118
  @classmethod
100
119
  def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +235,7 @@ def extend(reqs, model_runner):
216
235
  model_config=model_runner.model_config,
217
236
  enable_overlap=False,
218
237
  spec_algorithm=SpeculativeAlgorithm.NONE,
238
+ enable_custom_logit_processor=False,
219
239
  )
220
240
  batch.prepare_for_extend()
221
241
  model_worker_batch = batch.get_model_worker_batch()
@@ -286,7 +306,16 @@ def synchronize(device):
286
306
 
287
307
 
288
308
  def latency_test_run_once(
289
- run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
309
+ run_name,
310
+ model_runner,
311
+ rank_print,
312
+ reqs,
313
+ batch_size,
314
+ input_len,
315
+ output_len,
316
+ device,
317
+ profile,
318
+ profile_filename_prefix,
290
319
  ):
291
320
  max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
292
321
  if batch_size > max_batch_size:
@@ -308,6 +337,17 @@ def latency_test_run_once(
308
337
 
309
338
  tot_latency = 0
310
339
 
340
+ profiler = None
341
+ if profile:
342
+ profiler = torch.profiler.profile(
343
+ activities=[
344
+ torch.profiler.ProfilerActivity.CPU,
345
+ torch.profiler.ProfilerActivity.CUDA,
346
+ ],
347
+ with_stack=True,
348
+ )
349
+ profiler.start()
350
+
311
351
  # Prefill
312
352
  synchronize(device)
313
353
  tic = time.time()
@@ -338,6 +378,14 @@ def latency_test_run_once(
338
378
  f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
339
379
  )
340
380
 
381
+ if profile:
382
+ profiler.stop()
383
+ profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
384
+ parent_dir = os.path.dirname(os.path.abspath(profile_filename))
385
+ os.makedirs(parent_dir, exist_ok=True)
386
+ profiler.export_chrome_trace(profile_filename)
387
+ rank_print(f"torch profiler chrome trace saved to {profile_filename}")
388
+
341
389
  # Record decode timing from 2nd output
342
390
  if output_len > 1:
343
391
  med_decode_latency = np.median(decode_latencies)
@@ -363,6 +411,10 @@ def latency_test(
363
411
  bench_args,
364
412
  tp_rank,
365
413
  ):
414
+ # Set CPU affinity
415
+ if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
416
+ set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
417
+
366
418
  # Configure the logger
367
419
  configure_logger(server_args, prefix=f" TP{tp_rank}")
368
420
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
@@ -386,6 +438,8 @@ def latency_test(
386
438
  bench_args.input_len[0],
387
439
  8, # shorter decoding to speed up the warmup
388
440
  server_args.device,
441
+ profile=False,
442
+ profile_filename_prefix="", # not used
389
443
  )
390
444
 
391
445
  rank_print("Benchmark ...")
@@ -405,6 +459,8 @@ def latency_test(
405
459
  il,
406
460
  ol,
407
461
  server_args.device,
462
+ bench_args.profile if tp_rank == 0 else None,
463
+ bench_args.profile_filename_prefix,
408
464
  )
409
465
  if ret is not None:
410
466
  result_list.append(ret)
@@ -22,7 +22,7 @@ from typing import Tuple
22
22
  import numpy as np
23
23
  import requests
24
24
 
25
- from sglang.srt.server import launch_server
25
+ from sglang.srt.entrypoints.http_server import launch_server
26
26
  from sglang.srt.server_args import ServerArgs
27
27
  from sglang.srt.utils import kill_process_tree
28
28