sglang 0.4.0.post2.tar.gz → 0.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. {sglang-0.4.0.post2 → sglang-0.4.1}/PKG-INFO +3 -3
  2. {sglang-0.4.0.post2 → sglang-0.4.1}/README.md +1 -1
  3. {sglang-0.4.0.post2 → sglang-0.4.1}/pyproject.toml +3 -3
  4. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_offline_throughput.py +0 -12
  5. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_one_batch.py +0 -12
  6. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_serving.py +1 -0
  7. sglang-0.4.1/sglang/srt/aio_rwlock.py +100 -0
  8. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/model_config.py +8 -1
  9. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/flashinfer_backend.py +49 -5
  10. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/linear.py +20 -2
  11. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/layer.py +14 -39
  12. sglang-0.4.1/sglang/srt/layers/moe/fused_moe_native.py +46 -0
  13. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/__init__.py +3 -7
  14. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/fused_moe.py +110 -98
  15. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/layer.py +16 -48
  16. sglang-0.4.1/sglang/srt/layers/moe/topk.py +191 -0
  17. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/quantization/__init__.py +3 -3
  18. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/quantization/fp8.py +169 -32
  19. sglang-0.4.1/sglang/srt/layers/quantization/fp8_kernel.py +278 -0
  20. sglang-0.4.1/sglang/srt/layers/quantization/fp8_utils.py +116 -0
  21. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/torchao_utils.py +11 -15
  22. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/schedule_batch.py +16 -10
  23. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/scheduler.py +2 -2
  24. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/tokenizer_manager.py +86 -76
  25. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/memory_pool.py +15 -8
  26. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  27. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_executor/model_runner.py +6 -0
  28. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/dbrx.py +1 -1
  29. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/deepseek.py +1 -1
  30. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/deepseek_v2.py +67 -18
  31. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/grok.py +1 -1
  32. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mixtral.py +2 -2
  33. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/olmoe.py +1 -1
  34. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen2_moe.py +1 -1
  35. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/xverse_moe.py +1 -1
  36. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/openai_api/adapter.py +4 -0
  37. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/server.py +1 -0
  38. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/utils.py +33 -44
  39. sglang-0.4.1/sglang/test/test_block_fp8.py +341 -0
  40. sglang-0.4.1/sglang/version.py +1 -0
  41. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/PKG-INFO +3 -3
  42. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/SOURCES.txt +11 -7
  43. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/requires.txt +1 -1
  44. sglang-0.4.0.post2/sglang/srt/layers/fused_moe_patch.py +0 -133
  45. sglang-0.4.0.post2/sglang/srt/layers/quantization/fp8_utils.py +0 -27
  46. sglang-0.4.0.post2/sglang/version.py +0 -1
  47. {sglang-0.4.0.post2 → sglang-0.4.1}/LICENSE +0 -0
  48. {sglang-0.4.0.post2 → sglang-0.4.1}/setup.cfg +0 -0
  49. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/__init__.py +0 -0
  50. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/api.py +0 -0
  51. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_latency.py +0 -0
  52. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_one_batch_server.py +0 -0
  53. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/check_env.py +0 -0
  54. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/global_config.py +0 -0
  55. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/__init__.py +0 -0
  56. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/__init__.py +0 -0
  57. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/anthropic.py +0 -0
  58. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/base_backend.py +0 -0
  59. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/litellm.py +0 -0
  60. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/openai.py +0 -0
  61. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  62. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/vertexai.py +0 -0
  63. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/chat_template.py +0 -0
  64. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/choices.py +0 -0
  65. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/compiler.py +0 -0
  66. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/interpreter.py +0 -0
  67. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/ir.py +0 -0
  68. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/tracer.py +0 -0
  69. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/launch_server.py +0 -0
  70. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/launch_server_llavavid.py +0 -0
  71. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/llama3_eval.py +0 -0
  72. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/_custom_ops.py +0 -0
  73. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/__init__.py +0 -0
  74. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/device_config.py +0 -0
  75. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/exaone.py +0 -0
  76. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/load_config.py +0 -0
  77. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/qwen2vl.py +0 -0
  78. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/__init__.py +0 -0
  79. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  80. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/outlines_backend.py +0 -0
  81. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  82. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  83. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/conversation.py +0 -0
  84. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/__init__.py +0 -0
  85. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/communication_op.py +0 -0
  86. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  87. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  88. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  89. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  90. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  91. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  92. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  93. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  94. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  95. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/parallel_state.py +0 -0
  96. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/utils.py +0 -0
  97. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/hf_transformers_utils.py +0 -0
  98. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/activation.py +0 -0
  99. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/__init__.py +0 -0
  100. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  101. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  102. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  103. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  104. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  105. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  106. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  107. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/custom_op_util.py +0 -0
  108. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/layernorm.py +0 -0
  109. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/logits_processor.py +0 -0
  110. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/__init__.py +0 -0
  111. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/kernels.py +0 -0
  112. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/pooler.py +0 -0
  113. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/quantization/base_config.py +0 -0
  114. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/radix_attention.py +0 -0
  115. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/rotary_embedding.py +0 -0
  116. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/sampler.py +0 -0
  117. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  118. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/lora/lora.py +0 -0
  119. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/lora/lora_config.py +0 -0
  120. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/lora/lora_manager.py +0 -0
  121. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  122. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  123. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/image_processor.py +0 -0
  124. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/io_struct.py +0 -0
  125. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/schedule_policy.py +0 -0
  126. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/session_controller.py +0 -0
  127. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/tp_worker.py +0 -0
  128. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  129. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  130. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  131. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  132. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  133. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/metrics/collector.py +0 -0
  134. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/metrics/func_timer.py +0 -0
  135. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mm_utils.py +0 -0
  136. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  137. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/__init__.py +0 -0
  138. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/loader.py +0 -0
  139. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/utils.py +0 -0
  140. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/weight_utils.py +0 -0
  141. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_parallel.py +0 -0
  142. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/baichuan.py +0 -0
  143. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/chatglm.py +0 -0
  144. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/commandr.py +0 -0
  145. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/exaone.py +0 -0
  146. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gemma.py +0 -0
  147. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gemma2.py +0 -0
  148. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gemma2_reward.py +0 -0
  149. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gpt2.py +0 -0
  150. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gpt_bigcode.py +0 -0
  151. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/granite.py +0 -0
  152. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/internlm2.py +0 -0
  153. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/internlm2_reward.py +0 -0
  154. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama.py +0 -0
  155. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama_classification.py +0 -0
  156. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama_embedding.py +0 -0
  157. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama_reward.py +0 -0
  158. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llava.py +0 -0
  159. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llavavid.py +0 -0
  160. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/minicpm.py +0 -0
  161. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/minicpm3.py +0 -0
  162. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mistral.py +0 -0
  163. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mixtral_quant.py +0 -0
  164. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mllama.py +0 -0
  165. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/olmo.py +0 -0
  166. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/olmo2.py +0 -0
  167. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/phi3_small.py +0 -0
  168. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen.py +0 -0
  169. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen2.py +0 -0
  170. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen2_vl.py +0 -0
  171. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/registry.py +0 -0
  172. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/stablelm.py +0 -0
  173. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/torch_native_llama.py +0 -0
  174. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/xverse.py +0 -0
  175. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/yivl.py +0 -0
  176. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/openai_api/protocol.py +0 -0
  177. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  178. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  179. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  180. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  181. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  182. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  183. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  184. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/sampling_params.py +0 -0
  185. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/server_args.py +0 -0
  186. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/few_shot_gsm8k.py +0 -0
  187. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  188. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/run_eval.py +0 -0
  189. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/runners.py +0 -0
  190. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_common.py +0 -0
  191. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_gpqa.py +0 -0
  192. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_humaneval.py +0 -0
  193. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_math.py +0 -0
  194. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_mgsm.py +0 -0
  195. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_mmlu.py +0 -0
  196. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  197. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_activation.py +0 -0
  198. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_layernorm.py +0 -0
  199. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_programs.py +0 -0
  200. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_utils.py +0 -0
  201. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/utils.py +0 -0
  202. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/dependency_links.txt +0 -0
  203. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.0.post2 → sglang-0.4.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.0.post2
+ Version: 0.4.1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -234,7 +234,6 @@ Requires-Dist: pydantic; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
- Requires-Dist: gemlite; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
@@ -244,6 +243,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -358,7 +358,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0.post2 → sglang-0.4.1}/README.md
@@ -57,7 +57,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0.post2 → sglang-0.4.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.4.0.post2"
+ version = "0.4.1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -21,9 +21,9 @@ runtime_common = ["aiohttp", "decord", "fastapi",
  "orjson", "outlines>=0.0.44,<0.1.0",
  "packaging", "pillow", "prometheus-client>=0.20.0",
  "psutil", "pydantic", "python-multipart",
- "pyzmq>=25.1.2", "torchao>=0.7.0", "gemlite", "uvicorn", "uvloop",
+ "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
  "xgrammar>=0.1.6"]
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6"]
+ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post8"]

  # HIP (Heterogeneous-computing Interface for Portability) for AMD
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_offline_throughput.py
@@ -322,18 +322,6 @@ def throughput_test(
  )
  time.sleep(0.5)

- try:
-     import os
-     import pwd
-
-     from gemlite.core import GemLiteLinearTriton
-
-     GemLiteLinearTriton.cache_config(
-         f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-     )
- except ImportError:
-     pass
-
  logging.info("\nBenchmark...")
  result = throughput_test_once(
      backend_name=bench_args.backend,
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_one_batch.py
@@ -386,18 +386,6 @@ def latency_test(
      server_args.device,
  )

- try:
-     import os
-     import pwd
-
-     from gemlite.core import GemLiteLinearTriton
-
-     GemLiteLinearTriton.cache_config(
-         f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-     )
- except ImportError:
-     pass
-
  rank_print("Benchmark ...")

  # Run the sweep
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_serving.py
@@ -924,6 +924,7 @@ async def benchmark(
          f"are correctly specified. Error: {test_output.error}"
      )
  else:
+     requests.post(base_url + "/flush_cache")
      print("Initial test run completed. Starting main benchmark run...")

  time.sleep(1.5)
sglang-0.4.1/sglang/srt/aio_rwlock.py
@@ -0,0 +1,100 @@
+ import asyncio
+
+
+ class RWLock:
+     def __init__(self):
+         # Protects internal state
+         self._lock = asyncio.Lock()
+
+         # Condition variable used to wait for state changes
+         self._cond = asyncio.Condition(self._lock)
+
+         # Number of readers currently holding the lock
+         self._readers = 0
+
+         # Whether a writer is currently holding the lock
+         self._writer_active = False
+
+         # How many writers are queued waiting for a turn
+         self._waiting_writers = 0
+
+     @property
+     def reader_lock(self):
+         """
+         A context manager for acquiring a shared (reader) lock.
+
+         Example:
+             async with rwlock.reader_lock:
+                 # read-only access
+         """
+         return _ReaderLock(self)
+
+     @property
+     def writer_lock(self):
+         """
+         A context manager for acquiring an exclusive (writer) lock.
+
+         Example:
+             async with rwlock.writer_lock:
+                 # exclusive access
+         """
+         return _WriterLock(self)
+
+     async def acquire_reader(self):
+         async with self._lock:
+             # Wait until there is no active writer or waiting writer
+             # to ensure fairness.
+             while self._writer_active or self._waiting_writers > 0:
+                 await self._cond.wait()
+             self._readers += 1
+
+     async def release_reader(self):
+         async with self._lock:
+             self._readers -= 1
+             # If this was the last reader, wake up anyone waiting
+             # (potentially a writer or new readers).
+             if self._readers == 0:
+                 self._cond.notify_all()
+
+     async def acquire_writer(self):
+         async with self._lock:
+             # Increment the count of writers waiting
+             self._waiting_writers += 1
+             try:
+                 # Wait while either a writer is active or readers are present
+                 while self._writer_active or self._readers > 0:
+                     await self._cond.wait()
+                 self._writer_active = True
+             finally:
+                 # Decrement waiting writers only after we've acquired the writer lock
+                 self._waiting_writers -= 1
+
+     async def release_writer(self):
+         async with self._lock:
+             self._writer_active = False
+             # Wake up anyone waiting (readers or writers)
+             self._cond.notify_all()
+
+
+ class _ReaderLock:
+     def __init__(self, rwlock: RWLock):
+         self._rwlock = rwlock
+
+     async def __aenter__(self):
+         await self._rwlock.acquire_reader()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self._rwlock.release_reader()
+
+
+ class _WriterLock:
+     def __init__(self, rwlock: RWLock):
+         self._rwlock = rwlock
+
+     async def __aenter__(self):
+         await self._rwlock.acquire_writer()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self._rwlock.release_writer()
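The lock is writer-preferring: acquire_reader blocks not only while a writer holds the lock but also while any writer is queued, so a steady stream of readers cannot starve a waiting writer. A minimal usage sketch (a standalone demo, not code from the package; it assumes only the RWLock class above and its import path from this release):

import asyncio

from sglang.srt.aio_rwlock import RWLock


async def main():
    rwlock = RWLock()
    state = {"value": 0}

    async def reader(i: int):
        async with rwlock.reader_lock:  # shared: readers may overlap
            print(f"reader {i} sees {state['value']}")

    async def writer():
        async with rwlock.writer_lock:  # exclusive: waits out all readers
            state["value"] += 1

    await asyncio.gather(*(reader(i) for i in range(3)), writer())


asyncio.run(main())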
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/model_config.py
@@ -94,7 +94,10 @@ class ModelConfig:
  )

  # FIXME: temporary special judge for MLA architecture
- if "DeepseekV2ForCausalLM" in self.hf_config.architectures:
+ if (
+     "DeepseekV2ForCausalLM" in self.hf_config.architectures
+     or "DeepseekV3ForCausalLM" in self.hf_config.architectures
+ ):
      self.head_dim = 256
      self.attention_arch = AttentionArch.MLA
      self.kv_lora_rank = self.hf_config.kv_lora_rank
@@ -124,8 +127,12 @@ class ModelConfig:
  self.num_hidden_layers = self.hf_text_config.num_hidden_layers
  self.vocab_size = self.hf_text_config.vocab_size

+ # Verify quantization
  self._verify_quantization()

+ # Multimodal attrs
+ self.image_token_id = getattr(self.hf_config, "image_token_id", None)
+
  # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
  def get_total_num_kv_heads(self) -> int:
      """Returns the total number of KV heads."""
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/flashinfer_backend.py
@@ -18,11 +18,7 @@ import triton.language as tl
  from sglang.global_config import global_config
  from sglang.srt.layers.attention import AttentionBackend
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.utils import (
-     get_bool_env_var,
-     is_flashinfer_available,
-     should_use_tensor_core,
- )
+ from sglang.srt.utils import is_flashinfer_available

  if TYPE_CHECKING:
      from sglang.srt.layers.radix_attention import RadixAttention
@@ -731,3 +727,51 @@ def create_flashinfer_kv_indices_triton(
          mask=mask,
      )
      tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
+
+
+ def should_use_tensor_core(
+     kv_cache_dtype: torch.dtype,
+     num_attention_heads: int,
+     num_kv_heads: int,
+ ) -> bool:
+     """
+     Determine whether to use tensor cores for attention computation.
+
+     Args:
+         kv_cache_dtype: Data type of the KV cache
+         num_attention_heads: Number of attention heads
+         num_kv_heads: Number of key/value heads
+
+     Returns:
+         bool: Whether to use tensor cores
+     """
+     # Try to use environment variable first
+     env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
+     if env_override is not None:
+         return env_override.lower() == "true"
+
+     # Try to use _grouped_size_compiled_for_decode_kernels if available
+     # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
+     try:
+         from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+
+         if not _grouped_size_compiled_for_decode_kernels(
+             num_attention_heads,
+             num_kv_heads,
+         ):
+             return True
+         else:
+             return False
+     except (ImportError, AttributeError):
+         pass
+
+     # Calculate GQA group size
+     gqa_group_size = num_attention_heads // num_kv_heads
+
+     # Determine based on dtype and GQA group size
+     if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+         return True
+     elif kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16):
+         return gqa_group_size > 4
+     else:
+         return False
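The fallback heuristic at the bottom is easy to check by hand: a model with 32 attention heads and 8 KV heads has a GQA group size of 32 // 8 = 4, so an fp16 KV cache returns False (the group size must exceed 4), while an fp8 KV cache always returns True. A hypothetical sanity check (reachable only when SGLANG_FLASHINFER_USE_TENSOR_CORE is unset and the flashinfer probe above is unavailable):

import torch

from sglang.srt.layers.attention.flashinfer_backend import should_use_tensor_core

should_use_tensor_core(torch.float16, num_attention_heads=32, num_kv_heads=8)        # False: group size 4
should_use_tensor_core(torch.float8_e4m3fn, num_attention_heads=32, num_kv_heads=8)  # True: fp8 KV cache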
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/linear.py
@@ -30,6 +30,7 @@ from sglang.srt.layers.quantization.base_config import (
      QuantizationConfig,
      QuantizeMethodBase,
  )
+ from sglang.srt.layers.quantization.fp8_utils import BlockQuantScaleParameter
  from sglang.srt.utils import set_weight_attrs

  logger = logging.getLogger(__name__)
@@ -628,8 +629,19 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
  assert loaded_shard_id < len(self.output_sizes)

  tp_size = get_tensor_model_parallel_world_size()
- shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
- shard_size = self.output_sizes[loaded_shard_id] // tp_size
+
+ if isinstance(param, BlockQuantScaleParameter):
+     weight_block_size = self.quant_method.quant_config.weight_block_size
+     block_n, _ = weight_block_size[0], weight_block_size[1]
+     shard_offset = (
+         (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
+     ) // tp_size
+     shard_size = (
+         (self.output_sizes[loaded_shard_id] + block_n - 1) // block_n // tp_size
+     )
+ else:
+     shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
+     shard_size = self.output_sizes[loaded_shard_id] // tp_size

  param.load_merged_column_weight(
      loaded_weight=loaded_weight,
@@ -795,6 +807,12 @@ class QKVParallelLinear(ColumnParallelLinear):
  shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
  shard_size = self._get_shard_size_mapping(loaded_shard_id)

+ if isinstance(param, BlockQuantScaleParameter):
+     weight_block_size = self.quant_method.quant_config.weight_block_size
+     block_n, _ = weight_block_size[0], weight_block_size[1]
+     shard_offset = (shard_offset + block_n - 1) // block_n
+     shard_size = (shard_size + block_n - 1) // block_n
+
  param.load_qkv_weight(
      loaded_weight=loaded_weight,
      num_heads=self.num_kv_head_replicas,
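The ceiling division reflects how block-wise FP8 stores its scales: with a [block_n, block_k] tile size, a weight shard of R output rows owns ceil(R / block_n) rows of scale values, so shard offsets and sizes must be converted from weight rows to scale rows before loading. A worked example with made-up numbers (two merged projections of 2048 rows each, 128x128 blocks, no tensor parallelism):

output_sizes, block_n, tp_size = [2048, 2048], 128, 1
shard_offset = ((sum(output_sizes[:1]) + block_n - 1) // block_n) // tp_size  # 16 scale rows
shard_size = (output_sizes[1] + block_n - 1) // block_n // tp_size            # 16 scale rows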
{sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/layer.py
@@ -12,15 +12,15 @@ from vllm.model_executor.custom_op import CustomOp
  from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod

  from sglang.srt.layers.custom_op_util import register_custom_op
- from sglang.srt.layers.ep_moe.kernels import (
+ from sglang.srt.layers.moe.ep_moe.kernels import (
      grouped_gemm_triton,
      post_reorder_triton_kernel,
      pre_reorder_triton_kernel,
      run_moe_ep_preproess,
      silu_and_mul_triton_kernel,
  )
- from sglang.srt.layers.fused_moe_triton.fused_moe import fused_topk, grouped_topk
- from sglang.srt.layers.fused_moe_triton.layer import FusedMoEMethodBase
+ from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoEMethodBase
+ from sglang.srt.layers.moe.topk import select_experts
  from sglang.srt.layers.quantization.base_config import (
      QuantizationConfig,
      QuantizeMethodBase,
@@ -113,6 +113,7 @@ class EPMoE(torch.nn.Module):
      quant_config: Optional[QuantizationConfig] = None,
      tp_size: Optional[int] = None,
      prefix: str = "",
+     correction_bias: Optional[torch.Tensor] = None,
  ):
      super().__init__()

@@ -138,6 +139,7 @@
  assert num_expert_group is not None and topk_group is not None
  self.num_expert_group = num_expert_group
  self.topk_group = topk_group
+ self.correction_bias = correction_bias

  if quant_config is None:
      self.quant_method: Optional[QuantizeMethodBase] = UnquantizedEPMoEMethod()
@@ -170,13 +172,15 @@
      hidden_states.device, use_flashinfer=False  # TODO: use flashinfer
  )

- topk_weights, topk_ids = self.select_experts(
-     hidden_states,
-     router_logits,
-     self.top_k,
-     self.renormalize,
-     self.topk_group,
-     self.num_expert_group,
+ topk_weights, topk_ids = select_experts(
+     hidden_states=hidden_states,
+     router_logits=router_logits,
+     top_k=self.top_k,
+     use_grouped_topk=self.use_grouped_topk,
+     renormalize=self.renormalize,
+     topk_group=self.topk_group,
+     num_expert_group=self.num_expert_group,
+     correction_bias=self.correction_bias,
  )

  reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
@@ -297,35 +301,6 @@
      )
      return output

- def select_experts(
-     self,
-     hidden_states: torch.Tensor,
-     router_logits: torch.Tensor,
-     top_k: int,
-     renormalize: bool,
-     topk_group: Optional[int] = None,
-     num_expert_group: Optional[int] = None,
- ):
-     if self.use_grouped_topk:
-         assert topk_group is not None
-         assert num_expert_group is not None
-         topk_weights, topk_ids = grouped_topk(
-             hidden_states=hidden_states,
-             gating_output=router_logits,
-             topk=top_k,
-             renormalize=renormalize,
-             num_expert_group=num_expert_group,
-             topk_group=topk_group,
-         )
-     else:
-         topk_weights, topk_ids = fused_topk(
-             hidden_states=hidden_states,
-             gating_output=router_logits,
-             topk=top_k,
-             renormalize=renormalize,
-         )
-     return topk_weights, topk_ids.to(torch.int32)
-
  @classmethod
  def make_expert_params_mapping(
      cls,
sglang-0.4.1/sglang/srt/layers/moe/fused_moe_native.py
@@ -0,0 +1,46 @@
+ """
+ Torch-native implementation for FusedMoE. This is used for torch.compile.
+ It is based on https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/mixtral-moe/model.py#L204
+ """
+
+ from typing import Callable, Optional
+
+ import torch
+ from torch.nn import functional as F
+
+ from sglang.srt.layers.moe.topk import select_experts
+
+
+ def fused_moe_forward_native(
+     layer: torch.nn.Module,
+     x: torch.Tensor,
+     use_grouped_topk: bool,
+     top_k: int,
+     router_logits: torch.Tensor,
+     renormalize: bool,
+     topk_group: Optional[int] = None,
+     num_expert_group: Optional[int] = None,
+     custom_routing_function: Optional[Callable] = None,
+     correction_bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+     topk_weights, topk_ids = select_experts(
+         hidden_states=x,
+         router_logits=router_logits,
+         use_grouped_topk=use_grouped_topk,
+         top_k=top_k,
+         renormalize=renormalize,
+         topk_group=topk_group,
+         num_expert_group=num_expert_group,
+         custom_routing_function=custom_routing_function,
+         correction_bias=correction_bias,
+         torch_native=True,
+     )
+
+     w13_weights = layer.w13_weight[topk_ids]
+     w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+     w2_weights = layer.w2_weight[topk_ids]
+     x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
+     x1 = F.silu(x1)
+     x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+     expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+     return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
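For readers decoding the einsum subscripts: t indexes tokens, a the top-k experts selected per token, i the hidden size, and o the intermediate size. A shape-only sketch with made-up dimensions (an illustration, not package code) showing the same contractions:

import torch
import torch.nn.functional as F

t, a, i, o = 4, 2, 8, 16                  # tokens, top_k, hidden, intermediate
x = torch.randn(t, i)
w1 = torch.randn(t, a, o, i)              # gathered gate projections per token/expert
w3 = torch.randn(t, a, o, i)              # gathered up projections
w2 = torch.randn(t, a, i, o)              # gathered down projections
topk_weights = torch.softmax(torch.randn(t, a), dim=-1)

x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1))           # (t, a, o)
x3 = torch.einsum("ti,taoi -> tao", x, w3)                   # (t, a, o)
expert_outs = torch.einsum("tao,taio -> tai", x1 * x3, w2)   # (t, a, i)
out = torch.einsum("tai,ta -> ti", expert_outs, topk_weights)  # weighted sum -> (t, i)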
{sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/__init__.py
@@ -1,14 +1,12 @@
  from contextlib import contextmanager
  from typing import Any, Dict, Optional

- import sglang.srt.layers.fused_moe_triton.fused_moe  # noqa
- from sglang.srt.layers.fused_moe_triton.fused_moe import (
+ import sglang.srt.layers.moe.fused_moe_triton.fused_moe  # noqa
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
      fused_experts,
-     fused_topk,
      get_config_file_name,
-     grouped_topk,
  )
- from sglang.srt.layers.fused_moe_triton.layer import (
+ from sglang.srt.layers.moe.fused_moe_triton.layer import (
      FusedMoE,
      FusedMoEMethodBase,
      FusedMoeWeightScaleSupported,
@@ -37,8 +35,6 @@ __all__ = [
      "override_config",
      "get_config",
      "fused_moe",
-     "fused_topk",
      "fused_experts",
      "get_config_file_name",
-     "grouped_topk",
  ]
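Downstream code that imported the routing helpers from the old location needs updating: fused_topk and grouped_topk are no longer re-exported here, and this release routes expert selection through the unified helper in the new topk module instead. A sketch of the import change (paths taken from this diff):

# Before (0.4.0.post2):
# from sglang.srt.layers.fused_moe_triton import fused_topk, grouped_topk

# After (0.4.1): routing is unified behind select_experts
from sglang.srt.layers.moe.topk import select_experts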