sglang 0.4.2__tar.gz → 0.4.2.post1__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
Files changed (361)
  1. {sglang-0.4.2/sglang.egg-info → sglang-0.4.2.post1}/PKG-INFO +3 -3
  2. {sglang-0.4.2 → sglang-0.4.2.post1}/README.md +2 -2
  3. {sglang-0.4.2 → sglang-0.4.2.post1}/pyproject.toml +1 -1
  4. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  5. sglang-0.4.2.post1/sglang/srt/layers/attention/vision.py +407 -0
  6. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8.py +7 -0
  7. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/rotary_embedding.py +28 -12
  8. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/sampler.py +5 -2
  9. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/image_processor.py +77 -38
  10. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/scheduler.py +17 -3
  11. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  12. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/mem_cache/chunk_cache.py +3 -0
  13. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/mem_cache/radix_cache.py +30 -1
  14. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/minicpmv.py +129 -76
  15. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/mllama.py +16 -56
  16. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/qwen2.py +4 -1
  17. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_vl.py +18 -8
  18. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/server_args.py +6 -0
  19. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/utils.py +0 -2
  20. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/utils.py +42 -0
  21. sglang-0.4.2.post1/sglang/version.py +1 -0
  22. {sglang-0.4.2 → sglang-0.4.2.post1/sglang.egg-info}/PKG-INFO +3 -3
  23. sglang-0.4.2/sglang/srt/layers/attention/vision.py +0 -204
  24. sglang-0.4.2/sglang/version.py +0 -1
  25. {sglang-0.4.2 → sglang-0.4.2.post1}/LICENSE +0 -0
  26. {sglang-0.4.2 → sglang-0.4.2.post1}/setup.cfg +0 -0
  27. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/__init__.py +0 -0
  28. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/api.py +0 -0
  29. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/bench_latency.py +0 -0
  30. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/bench_offline_throughput.py +0 -0
  31. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/bench_one_batch.py +0 -0
  32. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/bench_one_batch_server.py +0 -0
  33. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/bench_serving.py +0 -0
  34. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/check_env.py +0 -0
  35. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/global_config.py +0 -0
  36. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/__init__.py +0 -0
  37. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/backend/__init__.py +0 -0
  38. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/backend/anthropic.py +0 -0
  39. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/backend/base_backend.py +0 -0
  40. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/backend/litellm.py +0 -0
  41. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/backend/openai.py +0 -0
  42. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  43. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/backend/vertexai.py +0 -0
  44. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/chat_template.py +0 -0
  45. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/choices.py +0 -0
  46. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/compiler.py +0 -0
  47. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/interpreter.py +0 -0
  48. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/ir.py +0 -0
  49. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/lang/tracer.py +0 -0
  50. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/launch_server.py +0 -0
  51. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/llama3_eval.py +0 -0
  52. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/_custom_ops.py +0 -0
  53. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/aio_rwlock.py +0 -0
  54. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/__init__.py +0 -0
  55. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/chatglm.py +0 -0
  56. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/dbrx.py +0 -0
  57. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/device_config.py +0 -0
  58. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/exaone.py +0 -0
  59. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/load_config.py +0 -0
  60. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/model_config.py +0 -0
  61. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  62. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  63. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
  64. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  65. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  66. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/conversation.py +0 -0
  67. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/__init__.py +0 -0
  68. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/communication_op.py +0 -0
  69. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  70. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  71. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  72. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  73. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  74. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  75. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  76. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  77. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  78. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/distributed/utils.py +0 -0
  79. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/entrypoints/engine.py +0 -0
  80. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/entrypoints/http_server.py +0 -0
  81. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/function_call_parser.py +0 -0
  82. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  83. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/activation.py +0 -0
  84. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  85. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  86. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  87. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  88. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  89. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  90. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  91. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  92. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/custom_op_util.py +0 -0
  93. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/dp_attention.py +0 -0
  94. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/layernorm.py +0 -0
  95. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/linear.py +0 -0
  96. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/logits_processor.py +0 -0
  97. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  98. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  99. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  100. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  101. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  102. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  103. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  104. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  105. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  106. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  107. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  108. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  109. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  110. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  111. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  112. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  113. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  114. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  115. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  116. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  117. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  118. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  119. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  120. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  121. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  122. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  123. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  124. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  125. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  126. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  127. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  128. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  129. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  130. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  131. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  132. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  133. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  134. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  135. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  136. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  137. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  138. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  139. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  140. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  141. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  142. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  143. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  144. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  145. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  146. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  147. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  148. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  149. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  150. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  151. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  152. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  153. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  154. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  155. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  156. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  157. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  158. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  159. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  160. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  161. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  162. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  163. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  164. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  165. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  166. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  167. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  168. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  169. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  170. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  171. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  172. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  173. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  174. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  175. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  176. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  177. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  178. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  179. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  180. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  181. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  182. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  183. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  184. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  185. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  186. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  187. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  188. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  189. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  190. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  191. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  192. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  193. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  194. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +0 -0
  195. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
  196. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/moe/topk.py +0 -0
  197. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/parameter.py +0 -0
  198. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/pooler.py +0 -0
  199. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  200. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  201. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  202. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  203. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  204. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  205. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  206. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  207. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  208. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  209. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  210. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  211. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  212. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  213. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  214. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  215. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  216. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  217. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  218. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  219. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  220. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  221. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  222. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  223. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  224. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  225. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  226. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  227. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  228. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  229. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  231. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  233. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  234. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  235. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  236. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  237. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  238. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  239. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  240. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  241. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  243. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  244. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  245. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  246. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  247. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  249. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  250. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  251. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  252. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  253. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/radix_attention.py +0 -0
  254. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  255. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  256. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/lora/lora.py +0 -0
  257. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/lora/lora_config.py +0 -0
  258. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/lora/lora_manager.py +0 -0
  259. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/cache_controller.py +0 -0
  260. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/configure_logging.py +0 -0
  261. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  262. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  263. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/io_struct.py +0 -0
  264. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/schedule_batch.py +0 -0
  265. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  266. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/session_controller.py +0 -0
  267. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/tokenizer_manager.py +0 -0
  268. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/tp_worker.py +0 -0
  269. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  270. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/managers/utils.py +0 -0
  271. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  272. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  273. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/metrics/collector.py +0 -0
  274. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/metrics/func_timer.py +0 -0
  275. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/mm_utils.py +0 -0
  276. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  277. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  278. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_executor/model_runner.py +0 -0
  279. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_loader/__init__.py +0 -0
  280. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_loader/loader.py +0 -0
  281. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_loader/utils.py +0 -0
  282. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
  283. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/model_parallel.py +0 -0
  284. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/baichuan.py +0 -0
  285. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/chatglm.py +0 -0
  286. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/commandr.py +0 -0
  287. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/dbrx.py +0 -0
  288. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/deepseek.py +0 -0
  289. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/deepseek_v2.py +0 -0
  290. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/exaone.py +0 -0
  291. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/gemma.py +0 -0
  292. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/gemma2.py +0 -0
  293. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  294. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/gpt2.py +0 -0
  295. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  296. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/granite.py +0 -0
  297. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/grok.py +0 -0
  298. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/internlm2.py +0 -0
  299. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  300. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/llama.py +0 -0
  301. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/llama_classification.py +0 -0
  302. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/llama_eagle.py +0 -0
  303. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/llama_embedding.py +0 -0
  304. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/llama_reward.py +0 -0
  305. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/llava.py +0 -0
  306. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/llavavid.py +0 -0
  307. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/minicpm.py +0 -0
  308. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/minicpm3.py +0 -0
  309. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/mistral.py +0 -0
  310. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/mixtral.py +0 -0
  311. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  312. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/olmo.py +0 -0
  313. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/olmo2.py +0 -0
  314. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/olmoe.py +0 -0
  315. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/phi3_small.py +0 -0
  316. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/qwen.py +0 -0
  317. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  318. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  319. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/registry.py +0 -0
  320. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/stablelm.py +0 -0
  321. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  322. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/xverse.py +0 -0
  323. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/xverse_moe.py +0 -0
  324. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/models/yivl.py +0 -0
  325. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/openai_api/adapter.py +0 -0
  326. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/openai_api/protocol.py +0 -0
  327. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  328. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  329. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  330. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  331. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  332. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  333. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  334. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  335. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  336. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/server.py +0 -0
  337. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  338. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/speculative/eagle_utils.py +0 -0
  339. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/speculative/eagle_worker.py +0 -0
  340. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/speculative/spec_info.py +0 -0
  341. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  342. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  343. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  344. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/run_eval.py +0 -0
  345. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/runners.py +0 -0
  346. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/simple_eval_common.py +0 -0
  347. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  348. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  349. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/simple_eval_math.py +0 -0
  350. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  351. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  352. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  353. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/test_activation.py +0 -0
  354. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/test_block_fp8.py +0 -0
  355. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/test_layernorm.py +0 -0
  356. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/test_programs.py +0 -0
  357. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang/test/test_utils.py +0 -0
  358. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang.egg-info/SOURCES.txt +0 -0
  359. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang.egg-info/dependency_links.txt +0 -0
  360. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang.egg-info/requires.txt +0 -0
  361. {sglang-0.4.2 → sglang-0.4.2.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: sglang
- Version: 0.4.2
+ Version: 0.4.2.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -333,7 +333,7 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
- - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeekSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -372,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.

  ## Acknowledgment and Citation
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -19,7 +19,7 @@
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
- - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeekSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -58,7 +58,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.

  ## Acknowledgment and Citation
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.4.2"
+ version = "0.4.2.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -166,6 +166,12 @@ def _fwd_kernel(
  def context_attention_fwd(
      q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
  ):
+     """
+     q, k, v: [b * s, head, head_dim]
+     b_start_loc: [b]
+     b_seq_len: [b]
+     out: [b * s, head, head_dim]
+     """
      if is_cuda_available and CUDA_CAPABILITY[0] > 8:
          BLOCK = 128
      else:
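The added docstring pins down the packed layout that context_attention_fwd expects: q, k, v, and the output buffer are flattened to [b * s, head, head_dim], with per-sequence start offsets and lengths passed separately. The sketch below prepares those arguments for a hypothetical two-sequence batch; the sizes are made up for illustration, and a CUDA device with Triton available is assumed.

import torch

from sglang.srt.layers.attention.triton_ops.prefill_attention import context_attention_fwd

# Two packed sequences of lengths 3 and 5, 8 heads, head_dim 64 (illustrative sizes).
b_seq_len = torch.tensor([3, 5], dtype=torch.int32, device="cuda")
b_start_loc = torch.tensor([0, 3], dtype=torch.int32, device="cuda")
total_tokens = int(b_seq_len.sum())
q = torch.randn(total_tokens, 8, 64, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)
o = torch.empty_like(q)  # output buffer: [b * s, head, head_dim]

context_attention_fwd(
    q, k, v, o, b_start_loc, b_seq_len,
    max_input_len=int(b_seq_len.max()), is_causal=False,
)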
@@ -0,0 +1,407 @@
+ from __future__ import annotations
+
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange, repeat
+
+ from sglang.srt.distributed import parallel_state
+ from sglang.srt.distributed import utils as dist_utils
+ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+     context_attention_fwd,
+ )
+ from sglang.srt.layers.linear import (
+     ColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
+ from sglang.srt.layers.quantization import QuantizationConfig
+
+
+ def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+     if not interleaved:
+         x1, x2 = x.chunk(2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+     else:
+         x1, x2 = x[..., ::2], x[..., 1::2]
+         return rearrange(
+             torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+         )
+
+
+ def apply_rotary_emb_torch(
+     x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
+ ) -> torch.Tensor:
+     """
+     x: (batch_size, seqlen, nheads, headdim)
+     cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+     """
+     ro_dim = cos.shape[-1] * 2
+     assert ro_dim <= x.shape[-1]
+     cos = repeat(
+         cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+     )
+     sin = repeat(
+         sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+     )
+     return torch.cat(
+         [
+             x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+             x[..., ro_dim:],
+         ],
+         dim=-1,
+     )
+
+
+ def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+     t_ = t.float()
+     cos = freqs.cos()
+     sin = freqs.sin()
+     output = apply_rotary_emb_torch(t_, cos, sin).type_as(t)
+     return output
+
+
+ class VisionAttention(nn.Module):
+     r"""
+     Multi-headed attention without any cache, mostly used for ViT.
+
+     Args:
+         use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
+         use_context_forward (bool, default to True):
+             if ``True``, a flash_attn style attention will be applied
+             Otherwise, a full-sequence attention will be applied.
+         use_full_precision_softmax (bool, default to False):
+             if ``True``, the softmax will be performed in full-precision
+             Otherwise, it will be performed in half-precision
+     """
+
+     def __init__(
+         self,
+         embed_dim: int,
+         num_heads: int,
+         projection_size: int,
+         use_qkv_parallel: bool,
+         quant_config: Optional[QuantizationConfig] = None,
+         dropout: float = 0.0,
+         use_context_forward: bool = True,
+         use_full_precision_softmax: bool = False,
+         flatten_batch: bool = False,
+         prefix: str = "",
+     ):
+         super().__init__()
+         self.use_context_forward = use_context_forward
+         world_size = parallel_state.get_tensor_model_parallel_world_size()
+         self.dropout = dropout
+         self.head_size = embed_dim // num_heads
+         self.hidden_size_per_attention_head = dist_utils.divide(
+             projection_size, num_heads
+         )
+         self.num_attention_heads_per_partition = dist_utils.divide(
+             num_heads, world_size
+         )
+
+         if self.use_context_forward:
+             self.qkv_backend = VisionTritonAttention()
+         else:
+             self.qkv_backend = VisionSdpaAttention(
+                 head_size=self.head_size,
+                 dropout=dropout,
+                 flatten_batch=flatten_batch,
+                 use_full_precision_softmax=use_full_precision_softmax,
+             )
+
+         self.use_qkv_parallel = use_qkv_parallel
+         if use_qkv_parallel:
+             self.qkv_proj = QKVParallelLinear(
+                 hidden_size=embed_dim,
+                 head_size=self.head_size,
+                 total_num_heads=num_heads,
+                 quant_config=quant_config,
+                 prefix=f"{prefix}.qkv_proj",
+             )
+         else:
+             self.qkv_proj = ColumnParallelLinear(
+                 input_size=embed_dim,
+                 output_size=3 * projection_size,
+                 quant_config=quant_config,
+                 prefix=f"{prefix}.qkv_proj",
+             )
+         self.proj = RowParallelLinear(
+             input_size=embed_dim,
+             output_size=embed_dim,
+             quant_config=quant_config,
+             prefix=f"{prefix}.out_proj",
+         )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         cu_seqlens: Optional[torch.Tensor] = None,
+         rotary_pos_emb: torch.Tensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         r"""
+         Args:
+             x: [b, s, embed_dim]
+             cu_seqlens: [b]
+         Returns:
+             [s, b, num_heads * head]
+         """
+         bsz, s, _ = x.shape
+         if self.use_qkv_parallel:
+             # [b, s, embed_dim] --> [b, s, embed_dim]
+             qkv, _ = self.qkv_proj(x)
+             q, k, v = qkv.chunk(3, dim=-1)
+
+             # [b, s, embed_dim] --> [b * s, num_heads, head_size]
+             q, k, v = [
+                 x.reshape(
+                     bsz * s, self.num_attention_heads_per_partition, -1
+                 ).contiguous()
+                 for x in (q, k, v)
+             ]
+         else:
+             # [b, s, embed_dim] --> [s, b, embed_dim]
+             x = rearrange(x, "b s ... -> s b ...")
+             # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
+             qkv, _ = self.qkv_proj(x)
+             # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
+             new_x_shape = qkv.size()[:-1] + (
+                 self.num_attention_heads_per_partition,
+                 3 * self.hidden_size_per_attention_head,
+             )
+             qkv = qkv.view(*new_x_shape)
+
+             # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
+             q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
+
+             # [s, b, head, head_size] --> [b, s, head, head_size]
+             q, k, v = [
+                 rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+             ]
+
+         if rotary_pos_emb is not None:
+             q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
+             k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+
+         if self.use_qkv_parallel:
+             pass
+         else:
+             # [b, s, head, head_size] --> [b * s, head, head_size]
+             q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
+
+         output = self.qkv_backend.forward(q, k, v, bsz, cu_seqlens, attention_mask)
+
+         if self.use_qkv_parallel:
+             # [b * s, h, head_size] --> [b, s, h * head_size]
+             output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
+
+             # [b, s, h * head_size] --> [b, s, h * head_size]
+             output, _ = self.proj(output)
+         else:
+             # [b * s, h, head_size] --> [s, b, h * head_size]
+             context_layer = rearrange(
+                 output, "(b s) h d -> s b (h d)", b=bsz, s=s
+             ).contiguous()
+
+             # [s, b, h * head_size] --> [s, b, h * head_size]
+             output, _ = self.proj(context_layer)
+
+             # [s, b, h * head_size] --> [b, s, h * head_size]
+             output = output.view(bsz, s, -1)
+
+         return output
+
+
+ class VisionSdpaAttention(nn.Module):
+     r"""
+     Scaled Dot Product Attention inner product
+     """
+
+     # TODO: Should it be released after used?
+     _mask_cache = {}
+
+     def __init__(
+         self,
+         head_size: int,
+         dropout: float = 0.0,
+         flatten_batch: bool = False,
+         use_full_precision_softmax: bool = False,
+     ):
+         super().__init__()
+         self.head_size = head_size
+         self.flatten_batch = flatten_batch
+         self.use_full_precision_softmax = use_full_precision_softmax
+         self.dropout = dropout
+
+     def generate_patch_attention_mask(
+         self,
+         s: int,
+         bsz: int,
+         device,
+         cu_seqlens: Optional[torch.Tensor],
+         flatten_batch: bool = False,
+         dtype=torch.bfloat16,
+     ) -> torch.Tensor:
+         r"""
+         Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+
+         When `flatten_batch` is True:
+             - All sequences in the batch are flattened into a single dimension
+             - `s` represents the total number of tokens across all sequences in the batch
+             - Returns a unified mask of shape `(1, 1, s, s)`
+
+         When `flatten_batch` is False:
+             - Each sequence has its own attention mask
+             - `s` represents the maximum sequence length in the batch
+             - Returns separate masks of shape `(b, 1, s, s)`
+
+         Args:
+             flatten_batch: (bool):
+                 If True, treats all sequences in the batch as a single flattened sequence
+                 If False, generates separate masks for each sequence
+
+         Returns:
+             Tensor of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+         """
+
+         cache_key = (s, bsz, flatten_batch, tuple(cu_seqlens.cpu().tolist()))
+
+         if cache_key in VisionSdpaAttention._mask_cache:
+             cached_mask = VisionSdpaAttention._mask_cache[cache_key]
+             # print(f"cache hit for key: {cache_key}")
+             return cached_mask.to(device=device, dtype=dtype)
+
+         if cu_seqlens is None:
+             raise ValueError("Internal Error: cu_seqlens cannot be None")
+
+         if flatten_batch:
+             mask = torch.zeros([1, s, s], device=device, dtype=torch.bool)
+             for i in range(1, len(cu_seqlens)):
+                 start = cu_seqlens[i - 1]
+                 end = cu_seqlens[i]
+                 mask[
+                     ...,
+                     start:end,
+                     start:end,
+                 ] = True
+         else:
+             # [1, 1, 1, s]
+             row_indices = torch.arange(s, device=device).view(1, 1, 1, s)
+             # [1, 1, s, 1]
+             col_indices = torch.arange(s, device=device).view(1, 1, s, 1)
+             # [b, 1, 1, 1]
+             seq_lens = (
+                 (cu_seqlens[1:] - cu_seqlens[:-1]).to(device=device).view(-1, 1, 1, 1)
+             )
+
+             mask = (row_indices < seq_lens) & (col_indices < seq_lens)
+
+         # Convert to attention mask format (False -> 0, True -> -inf)
+         mask = (~mask).to(dtype) * torch.finfo(dtype).min
+
+         VisionSdpaAttention._mask_cache[cache_key] = mask
+
+         return mask
+
+     def forward(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         bsz: int,
+         cu_seqlens: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         r"""
+         Args:
+             cu_seqlens: [b]
+         Returns:
+             [b * s, h, head_size]
+         """
+
+         s = q.shape[0] // bsz
+
+         # [b, 1, s, s]
+         if attention_mask is None:
+             attention_mask = self.generate_patch_attention_mask(
+                 s, bsz, q.device, cu_seqlens, self.flatten_batch, q.dtype
+             )
+         q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
+         # [b, 1, s]
+         if self.use_full_precision_softmax:
+             scale = self.head_size**-0.5
+             k_transposed = rearrange(k, "b h s d -> b h d s")
+             attn_weights = torch.matmul(q, k_transposed) * scale
+             del k, k_transposed
+             attn_weights = attn_weights + attention_mask
+             del attention_mask
+             # full-precision
+             attn_weights = nn.functional.softmax(
+                 attn_weights, dim=-1, dtype=torch.float32
+             ).to(q.dtype)
+             attn_weights = nn.functional.dropout(
+                 attn_weights, p=self.dropout, training=False
+             )
+             output = torch.matmul(attn_weights, v)
+             del attn_weights, v
+         else:
+             # SDPA
+             # [b, h, s, head_size]
+             output = F.scaled_dot_product_attention(
+                 q, k, v, attention_mask, dropout_p=self.dropout
+             )
+
+         # [b, h, s, head_size] --> [b * s, h, head_size]
+         output = rearrange(output, "b h s d -> (b s) h d")
+
+         return output
+
+
+ class VisionTritonAttention(nn.Module):
+     """
+     Triton-implemented attention without a causal mask
+     """
+
+     def __init__(
+         self,
+     ):
+         super().__init__()
+
+     def forward(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         _bsz: int,
+         cu_seqlens: Optional[torch.Tensor],
+         **kwargs,
+     ) -> torch.Tensor:
+         r"""
+         Args:
+             cu_seqlens: [b]
+         Returns:
+             [b * s, h, head_size]
+         """
+
+         # [b * s, head, head_size]
+         output = torch.empty_like(q)
+         seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+         max_seqlen = seq_lens.max().item()
+         context_attention_fwd(
+             q,
+             k,
+             v,
+             output,
+             cu_seqlens.cuda(),
+             seq_lens.cuda(),
+             max_seqlen,
+             is_causal=False,
+         )
+
+         return output
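The new VisionSdpaAttention builds its padding mask purely from cu_seqlens: with flatten_batch=False it compares row and column patch indices against each sequence length and turns the padded region into a large negative bias before SDPA. The snippet below is a minimal standalone sketch of that masking rule, independent of the module and with made-up sequence lengths.

import torch

def patch_attention_mask(cu_seqlens: torch.Tensor, s: int, dtype=torch.float32) -> torch.Tensor:
    # Per-sequence lengths, broadcast against row/column patch indices.
    seq_lens = (cu_seqlens[1:] - cu_seqlens[:-1]).view(-1, 1, 1, 1)  # [b, 1, 1, 1]
    rows = torch.arange(s).view(1, 1, 1, s)
    cols = torch.arange(s).view(1, 1, s, 1)
    keep = (rows < seq_lens) & (cols < seq_lens)                     # [b, 1, s, s] boolean
    # False -> large negative bias, True -> 0, same convention as the diff above.
    return (~keep).to(dtype) * torch.finfo(dtype).min

# Two image sequences of 4 and 2 patches, padded to s=4 (illustrative numbers).
mask = patch_attention_mask(torch.tensor([0, 4, 6]), s=4)
print(mask.shape)  # torch.Size([2, 1, 4, 4])

This mirrors the non-flattened branch of generate_patch_attention_mask; the flattened branch instead marks block-diagonal regions of a single (1, 1, s, s) mask.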
@@ -290,6 +290,13 @@ class Fp8LinearMethod(LinearMethodBase):
                      weight_scale, requires_grad=False
                  )
                  layer.input_scale = None
+             else:
+                 layer.weight = torch.nn.Parameter(
+                     layer.weight.data, requires_grad=False
+                 )
+                 layer.weight_scale_inv = torch.nn.Parameter(
+                     layer.weight_scale_inv.data, requires_grad=False
+                 )
              return
          layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
          # If checkpoint not serialized fp8, quantize the weights.
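The new else branch re-wraps the already-loaded weight and weight_scale_inv tensors as frozen Parameters for block-quantized FP8 checkpoints. A tiny illustration of that pattern in plain PyTorch (generic module, not sglang's layer):

import torch

lin = torch.nn.Linear(4, 4)
# Re-wrap an already-loaded tensor as a frozen Parameter, keeping it registered
# on the module, as the added else-branch does for layer.weight and layer.weight_scale_inv.
lin.weight = torch.nn.Parameter(lin.weight.data, requires_grad=False)
print(lin.weight.requires_grad)  # False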
@@ -6,9 +6,15 @@ from typing import Any, Dict, List, Optional, Tuple, Union

  import torch
  import torch.nn as nn
+ from vllm import _custom_ops as ops
  from vllm.model_executor.custom_op import CustomOp

  from sglang.srt.layers.custom_op_util import register_custom_op
+ from sglang.srt.utils import is_cuda_available
+
+ _is_cuda_available = is_cuda_available()
+ if _is_cuda_available:
+     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace


  def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -75,7 +81,9 @@ class RotaryEmbedding(CustomOp):
          self.dtype = dtype

          cache = self._compute_cos_sin_cache()
-         cache = cache.to(dtype)
+         # NOTE(ByronHsu): cache needs to be in FP32 for numerical stability
+         if not _is_cuda_available:
+             cache = cache.to(dtype)
          self.cos_sin_cache: torch.Tensor
          self.register_buffer("cos_sin_cache", cache, persistent=False)
@@ -141,17 +149,25 @@
          key: torch.Tensor,
          offsets: Optional[torch.Tensor] = None,
      ) -> Tuple[torch.Tensor, torch.Tensor]:
-         from vllm import _custom_ops as ops
-
-         self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
-         ops.rotary_embedding(
-             positions,
-             query,
-             key,
-             self.head_size,
-             self.cos_sin_cache,
-             self.is_neox_style,
-         )
+         if _is_cuda_available:
+             apply_rope_with_cos_sin_cache_inplace(
+                 positions=positions,
+                 query=query,
+                 key=key,
+                 head_size=self.head_size,
+                 cos_sin_cache=self.cos_sin_cache,
+                 is_neox=self.is_neox_style,
+             )
+         else:
+             self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
+             ops.rotary_embedding(
+                 positions,
+                 query,
+                 key,
+                 self.head_size,
+                 self.cos_sin_cache,
+                 self.is_neox_style,
+             )
          return query, key

      def forward_xpu(
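The rotary change dispatches on CUDA availability: with CUDA, the cos/sin cache is kept in FP32 and sgl_kernel's apply_rope_with_cos_sin_cache_inplace is used; otherwise the code falls back to the vLLM op with the cache cast to the query dtype. For reference, the sketch below computes neox-style rotation directly from a cos/sin cache in plain PyTorch; it is only an illustration of what these fused kernels compute, not the package's kernel, and the shapes and base are chosen for the example.

import torch

def rope_neox_reference(x: torch.Tensor, cos_sin_cache: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
    # x: [num_tokens, num_heads, head_size]; cache row i = [cos_i | sin_i] of width rot_dim
    rot_dim = cos_sin_cache.shape[-1]
    cos, sin = cos_sin_cache[positions].chunk(2, dim=-1)  # each [num_tokens, rot_dim // 2]
    cos = cos.repeat(1, 2).unsqueeze(1)                   # [num_tokens, 1, rot_dim]
    sin = sin.repeat(1, 2).unsqueeze(1)
    x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
    x1, x2 = x_rot.chunk(2, dim=-1)
    rotated = torch.cat((-x2, x1), dim=-1)                # neox-style rotate_half
    return torch.cat((x_rot * cos + rotated * sin, x_pass), dim=-1)

# Illustrative cache: 128 positions, rotary dim 32, base 10000.
inv_freq = 1.0 / (10000 ** (torch.arange(0, 16, dtype=torch.float32) / 16))
freqs = torch.outer(torch.arange(128, dtype=torch.float32), inv_freq)
cache = torch.cat((freqs.cos(), freqs.sin()), dim=-1)     # [128, 32]
out = rope_neox_reference(torch.randn(6, 4, 32), cache, torch.arange(6))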
@@ -72,9 +72,11 @@ class Sampler(nn.Module):
              # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
              # https://github.com/flashinfer-ai/flashinfer/issues/708
              # so we use the torch implementation.
+
+             # clamp to avoid -inf
              logprobs = torch.log(
                  top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-             )
+             ).clamp(min=torch.finfo(probs.dtype).min)

              max_top_k_round, batch_size = 32, probs.shape[0]
              uniform_samples = torch.rand(
@@ -109,9 +111,10 @@ class Sampler(nn.Module):
                  sampling_info.need_min_p_sampling,
              )
              if return_logprob:
+                 # clamp to avoid -inf
                  logprobs = torch.log(
                      top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-                 )
+                 ).clamp(min=torch.finfo(probs.dtype).min)
          else:
              raise ValueError(
                  f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"