sglang 0.4.1.post3__tar.gz → 0.4.1.post4__tar.gz

This diff compares the contents of two publicly released package versions as they appear in one of the supported registries. It is provided for informational purposes only.
Files changed (336)
  1. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/PKG-INFO +9 -8
  2. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/README.md +6 -5
  3. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/pyproject.toml +3 -3
  4. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/bench_one_batch.py +2 -0
  5. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/__init__.py +14 -5
  6. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
  7. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/flashinfer_backend.py +211 -81
  8. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/torch_native_backend.py +1 -38
  9. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/triton_backend.py +20 -11
  10. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
  11. sglang-0.4.1.post4/sglang/srt/layers/logits_processor.py +346 -0
  12. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  13. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  14. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  15. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  16. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  17. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  18. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  19. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  20. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  21. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  22. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  23. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  24. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  25. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  26. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  28. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  29. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  30. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  31. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  32. sglang-0.4.1.post4/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  33. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +187 -29
  34. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -6
  35. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/fp8.py +2 -2
  36. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/sampler.py +57 -21
  37. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/torchao_utils.py +17 -3
  38. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/io_struct.py +1 -2
  39. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/schedule_batch.py +26 -2
  40. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/schedule_policy.py +159 -90
  41. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/scheduler.py +62 -26
  42. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/tokenizer_manager.py +22 -20
  43. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/tp_worker.py +16 -4
  44. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  45. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_executor/cuda_graph_runner.py +118 -73
  46. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_executor/forward_batch_info.py +33 -8
  47. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_executor/model_runner.py +63 -61
  48. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/deepseek_v2.py +34 -7
  49. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/grok.py +97 -26
  50. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/openai_api/adapter.py +0 -17
  51. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/sampling_batch_info.py +21 -0
  52. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/sampling_params.py +9 -1
  53. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/server.py +9 -5
  54. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/server_args.py +108 -57
  55. sglang-0.4.1.post4/sglang/srt/speculative/build_eagle_tree.py +347 -0
  56. sglang-0.4.1.post4/sglang/srt/speculative/eagle_utils.py +618 -0
  57. sglang-0.4.1.post4/sglang/srt/speculative/eagle_worker.py +170 -0
  58. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/speculative/spec_info.py +5 -0
  59. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/utils.py +15 -2
  60. sglang-0.4.1.post4/sglang/version.py +1 -0
  61. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang.egg-info/PKG-INFO +9 -8
  62. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang.egg-info/SOURCES.txt +24 -0
  63. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang.egg-info/requires.txt +2 -2
  64. sglang-0.4.1.post3/sglang/srt/layers/logits_processor.py +0 -391
  65. sglang-0.4.1.post3/sglang/version.py +0 -1
  66. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/LICENSE +0 -0
  67. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/setup.cfg +0 -0
  68. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/__init__.py +0 -0
  69. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/api.py +0 -0
  70. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/bench_latency.py +0 -0
  71. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/bench_offline_throughput.py +0 -0
  72. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/bench_one_batch_server.py +0 -0
  73. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/bench_serving.py +0 -0
  74. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/check_env.py +0 -0
  75. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/global_config.py +0 -0
  76. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/__init__.py +0 -0
  77. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/backend/__init__.py +0 -0
  78. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/backend/anthropic.py +0 -0
  79. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/backend/base_backend.py +0 -0
  80. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/backend/litellm.py +0 -0
  81. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/backend/openai.py +0 -0
  82. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/backend/runtime_endpoint.py +0 -0
  83. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/backend/vertexai.py +0 -0
  84. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/chat_template.py +0 -0
  85. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/choices.py +0 -0
  86. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/compiler.py +0 -0
  87. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/interpreter.py +0 -0
  88. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/ir.py +0 -0
  89. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/lang/tracer.py +0 -0
  90. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/launch_server.py +0 -0
  91. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/launch_server_llavavid.py +0 -0
  92. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/llama3_eval.py +0 -0
  93. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/_custom_ops.py +0 -0
  94. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/aio_rwlock.py +0 -0
  95. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/configs/__init__.py +0 -0
  96. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/configs/device_config.py +0 -0
  97. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/configs/exaone.py +0 -0
  98. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/configs/load_config.py +0 -0
  99. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/configs/model_config.py +0 -0
  100. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/configs/qwen2vl.py +0 -0
  101. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/constrained/__init__.py +0 -0
  102. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  103. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/constrained/outlines_backend.py +0 -0
  104. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  105. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  106. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/conversation.py +0 -0
  107. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/__init__.py +0 -0
  108. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/communication_op.py +0 -0
  109. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  110. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  111. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  112. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  113. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  114. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  115. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  116. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  117. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  118. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/parallel_state.py +0 -0
  119. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/distributed/utils.py +0 -0
  120. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/hf_transformers_utils.py +0 -0
  121. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/activation.py +0 -0
  122. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  123. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  124. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  125. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/custom_op_util.py +0 -0
  126. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/layernorm.py +0 -0
  127. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/linear.py +0 -0
  128. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  129. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  130. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  131. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  132. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  133. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  134. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  135. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  136. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  137. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  138. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  139. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  140. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  141. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  142. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  143. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  144. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  145. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  146. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  147. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  148. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  149. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  150. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  151. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  152. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  153. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  154. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  155. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  156. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  157. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  158. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  159. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  160. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  161. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  162. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  163. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  164. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  165. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  166. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  167. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  168. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  169. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  170. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  171. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  172. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  173. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  174. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  175. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  176. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  177. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  178. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  179. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  180. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  181. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  182. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  183. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  184. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  185. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  186. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  187. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  188. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  189. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  190. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  191. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  192. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  193. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  194. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  195. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  196. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  197. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  198. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  199. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  200. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  201. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  202. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  203. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/moe/topk.py +0 -0
  204. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/pooler.py +0 -0
  205. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/__init__.py +0 -0
  206. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/base_config.py +0 -0
  207. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  208. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  209. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  210. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  211. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  212. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  213. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  214. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  215. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  216. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  217. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  218. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  219. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  220. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  221. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  222. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  223. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  224. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  225. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  226. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  227. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  228. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  229. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  231. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  233. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  234. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  235. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  236. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  237. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  238. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  239. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  240. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  241. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  243. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  244. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  245. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  246. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  247. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/radix_attention.py +0 -0
  248. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/rotary_embedding.py +0 -0
  249. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  250. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/lora/lora.py +0 -0
  251. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/lora/lora_config.py +0 -0
  252. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/lora/lora_manager.py +0 -0
  253. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/data_parallel_controller.py +0 -0
  254. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/detokenizer_manager.py +0 -0
  255. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/image_processor.py +0 -0
  256. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/managers/session_controller.py +0 -0
  257. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  258. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  259. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/mem_cache/flush_cache.py +0 -0
  260. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/mem_cache/memory_pool.py +0 -0
  261. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/mem_cache/radix_cache.py +0 -0
  262. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/metrics/collector.py +0 -0
  263. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/metrics/func_timer.py +0 -0
  264. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/mm_utils.py +0 -0
  265. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_loader/__init__.py +0 -0
  266. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_loader/loader.py +0 -0
  267. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_loader/utils.py +0 -0
  268. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_loader/weight_utils.py +0 -0
  269. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/model_parallel.py +0 -0
  270. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/baichuan.py +0 -0
  271. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/chatglm.py +0 -0
  272. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/commandr.py +0 -0
  273. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/dbrx.py +0 -0
  274. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/deepseek.py +0 -0
  275. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/exaone.py +0 -0
  276. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/gemma.py +0 -0
  277. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/gemma2.py +0 -0
  278. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/gemma2_reward.py +0 -0
  279. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/gpt2.py +0 -0
  280. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/gpt_bigcode.py +0 -0
  281. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/granite.py +0 -0
  282. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/internlm2.py +0 -0
  283. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/internlm2_reward.py +0 -0
  284. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/llama.py +0 -0
  285. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/llama_classification.py +0 -0
  286. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/llama_eagle.py +0 -0
  287. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/llama_embedding.py +0 -0
  288. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/llama_reward.py +0 -0
  289. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/llava.py +0 -0
  290. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/llavavid.py +0 -0
  291. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/minicpm.py +0 -0
  292. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/minicpm3.py +0 -0
  293. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/mistral.py +0 -0
  294. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/mixtral.py +0 -0
  295. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/mixtral_quant.py +0 -0
  296. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/mllama.py +0 -0
  297. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/olmo.py +0 -0
  298. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/olmo2.py +0 -0
  299. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/olmoe.py +0 -0
  300. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/phi3_small.py +0 -0
  301. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/qwen.py +0 -0
  302. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/qwen2.py +0 -0
  303. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/qwen2_moe.py +0 -0
  304. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/qwen2_vl.py +0 -0
  305. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/registry.py +0 -0
  306. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/stablelm.py +0 -0
  307. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/torch_native_llama.py +0 -0
  308. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/xverse.py +0 -0
  309. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/xverse_moe.py +0 -0
  310. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/models/yivl.py +0 -0
  311. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/openai_api/protocol.py +3 -3
  312. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  313. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  314. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  315. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  316. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  317. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  318. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/few_shot_gsm8k.py +0 -0
  319. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  320. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/run_eval.py +0 -0
  321. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/runners.py +0 -0
  322. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/simple_eval_common.py +0 -0
  323. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/simple_eval_gpqa.py +0 -0
  324. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/simple_eval_humaneval.py +0 -0
  325. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/simple_eval_math.py +0 -0
  326. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/simple_eval_mgsm.py +0 -0
  327. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/simple_eval_mmlu.py +0 -0
  328. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  329. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/test_activation.py +0 -0
  330. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/test_block_fp8.py +0 -0
  331. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/test_layernorm.py +0 -0
  332. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/test_programs.py +0 -0
  333. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/test/test_utils.py +0 -0
  334. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/utils.py +0 -0
  335. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang.egg-info/dependency_links.txt +0 -0
  336. {sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.1.post3 → sglang-0.4.1.post4}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.1.post3
+ Version: 0.4.1.post4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -243,11 +243,11 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
- Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
- Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
+ Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
  Provides-Extra: srt-hpu
@@ -315,7 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"

  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
  | [**Documentation**](https://sgl-project.github.io/)
- | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
  | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

@@ -347,9 +347,10 @@ The core features include:

  ## Getting Started
  - [Install SGLang](https://sgl-project.github.io/start/install.html)
- - [Send requests](https://sgl-project.github.io/start/send_request.html)
- - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
- - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
+ - [Quick Start](https://sgl-project.github.io/start/send_request.html)
+ - [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
+ - [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
+ - [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)

  ## Benchmark and Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
@@ -361,5 +362,5 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.

  ## Acknowledgment and Citation
- We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.1.post3 → sglang-0.4.1.post4}/README.md

@@ -14,7 +14,7 @@

  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
  | [**Documentation**](https://sgl-project.github.io/)
- | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
  | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

@@ -46,9 +46,10 @@ The core features include:

  ## Getting Started
  - [Install SGLang](https://sgl-project.github.io/start/install.html)
- - [Send requests](https://sgl-project.github.io/start/send_request.html)
- - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
- - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
+ - [Quick Start](https://sgl-project.github.io/start/send_request.html)
+ - [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
+ - [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
+ - [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)

  ## Benchmark and Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
@@ -60,5 +61,5 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.

  ## Acknowledgment and Citation
- We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.1.post3 → sglang-0.4.1.post4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.4.1.post3"
+ version = "0.4.1.post4"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -23,11 +23,11 @@ runtime_common = ["aiohttp", "decord", "fastapi",
  "psutil", "pydantic", "python-multipart",
  "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
  "xgrammar>=0.1.6"]
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post10"]
+ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post11"]

  # HIP (Heterogeneous-computing Interface for Portability) for AMD
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
- srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
+ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post2.dev1"]
  # xpu is not enabled in public vllm and torch whl,
  # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
  srt_xpu = ["sglang[runtime_common]"]
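
The pyproject.toml hunks above only bump the package version and tighten two dependency pins: sgl-kernel moves to >=0.0.2.post11 and the ROCm extra pins vllm==0.6.3.post2.dev1. As a minimal sketch (not part of the package), the new pins can be checked after an upgrade with the standard library alone:

    # Hedged sketch: prints the installed versions of the two distributions
    # named in the diff above; adjust the names if your environment differs.
    import importlib.metadata as md

    for pkg in ("sglang", "sgl-kernel"):
        try:
            print(pkg, md.version(pkg))
        except md.PackageNotFoundError:
            print(pkg, "not installed")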

{sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/bench_one_batch.py

@@ -63,6 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server import _set_envs_and_config
  from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
  from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers


@@ -214,6 +215,7 @@ def extend(reqs, model_runner):
          tree_cache=None,
          model_config=model_runner.model_config,
          enable_overlap=False,
+         spec_algorithm=SpeculativeAlgorithm.NONE,
      )
      batch.prepare_for_extend()
      model_worker_batch = batch.get_model_worker_batch()
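
The two bench_one_batch.py hunks add the new speculative-decoding plumbing to the benchmark: an import of SpeculativeAlgorithm and an explicit spec_algorithm keyword at batch construction. A hedged sketch of the default value, using only the names that appear in the hunks above (the enclosing batch constructor call is abbreviated away):

    # Hedged sketch: the import and the NONE member come from the diff above;
    # passing this value as spec_algorithm means no draft model is used.
    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

    spec_algorithm = SpeculativeAlgorithm.NONE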

{sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/__init__.py

@@ -1,10 +1,14 @@
+ from __future__ import annotations
+
  from abc import ABC, abstractmethod
- from typing import Optional
+ from typing import TYPE_CHECKING, Optional

  import torch

- from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+ if TYPE_CHECKING:
+     from sglang.srt.layers.radix_attention import RadixAttention
+     from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+     from sglang.srt.speculative.spec_info import SpecInfo


  class AttentionBackend(ABC):
@@ -22,9 +26,12 @@ class AttentionBackend(ABC):
      def init_forward_metadata_capture_cuda_graph(
          self,
          bs: int,
+         num_tokens: int,
          req_pool_indices: torch.Tensor,
          seq_lens: torch.Tensor,
-         encoder_lens: Optional[torch.Tensor] = None,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: ForwardMode,
+         spec_info: Optional[SpecInfo],
      ):
          """Init the metadata for a forward pass for capturing a cuda graph."""
          raise NotImplementedError()
@@ -35,7 +42,9 @@
          req_pool_indices: torch.Tensor,
          seq_lens: torch.Tensor,
          seq_lens_sum: int,
-         encoder_lens: Optional[torch.Tensor] = None,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: ForwardMode,
+         spec_info: Optional[SpecInfo],
      ):
          """Init the metadata for a forward pass for replying a cuda graph."""
          raise NotImplementedError()
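
The AttentionBackend hunks above widen both CUDA-graph hooks: encoder_lens loses its default value, and num_tokens, forward_mode, and spec_info join the capture/replay contract. A minimal sketch, assuming only the signatures shown in the diff, of the stub a custom backend would now have to provide (the class name and empty bodies are illustrative, not from the package; ForwardMode/SpecInfo are quoted as string annotations because upstream only imports them under TYPE_CHECKING):

    # Hedged sketch of a subclass against the widened 0.4.1.post4 interface.
    from typing import Optional

    import torch

    from sglang.srt.layers.attention import AttentionBackend


    class NoopBackend(AttentionBackend):  # hypothetical name, for illustration only
        def init_forward_metadata_capture_cuda_graph(
            self,
            bs: int,
            num_tokens: int,
            req_pool_indices: torch.Tensor,
            seq_lens: torch.Tensor,
            encoder_lens: Optional[torch.Tensor],
            forward_mode: "ForwardMode",
            spec_info: Optional["SpecInfo"],
        ):
            pass  # capture-time metadata would be prepared here

        def init_forward_metadata_replay_cuda_graph(
            self,
            bs: int,
            req_pool_indices: torch.Tensor,
            seq_lens: torch.Tensor,
            seq_lens_sum: int,
            encoder_lens: Optional[torch.Tensor],
            forward_mode: "ForwardMode",
            spec_info: Optional["SpecInfo"],
        ):
            pass  # replay-time metadata would be refreshed here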

{sglang-0.4.1.post3 → sglang-0.4.1.post4}/sglang/srt/layers/attention/double_sparsity_backend.py

@@ -3,7 +3,6 @@ from __future__ import annotations
  from typing import TYPE_CHECKING

  import torch
- import torch.nn as nn

  from sglang.srt.layers.attention import AttentionBackend
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -52,8 +51,6 @@ class DoubleSparseAttnBackend(AttentionBackend):

          self.forward_metadata = None

-         self.cuda_graph_max_seq_len = model_runner.model_config.context_len
-
      def init_forward_metadata(self, forward_batch: ForwardBatch):
          """Init auxiliary variables for triton attention backend."""

@@ -115,55 +112,6 @@
              ds_req_to_token,
          )

-     def init_cuda_graph_state(self, max_bs: int):
-         # TODO(Andy): Support CUDA graph for double sparse attention
-         raise ValueError(
-             "Double sparse attention does not support CUDA graph for now. Please --disable-cuda-graph"
-         )
-         self.cuda_graph_max_total_num_tokens = max_bs * self.cuda_graph_max_seq_len
-
-         self.cuda_graph_start_loc = torch.zeros(
-             (max_bs,), dtype=torch.int32, device="cuda"
-         )
-         self.cuda_graph_attn_logits = torch.empty(
-             (
-                 self.num_head,
-                 self.cuda_graph_max_total_num_tokens,
-             ),
-             dtype=self.reduce_dtype,
-             device="cuda",
-         )
-
-     def init_forward_metadata_capture_cuda_graph(
-         self,
-         bs: int,
-         req_pool_indices: torch.Tensor,
-         seq_lens: torch.Tensor,
-         encoder_lens=None,
-     ):
-         # NOTE: encoder_lens expected to be zeros or None
-         self.forward_metadata = (
-             self.cuda_graph_start_loc,
-             self.cuda_graph_attn_logits,
-             self.cuda_graph_max_seq_len,
-             None,
-         )
-
-     def init_forward_metadata_replay_cuda_graph(
-         self,
-         bs: int,
-         req_pool_indices: torch.Tensor,
-         seq_lens: torch.Tensor,
-         seq_lens_sum: int,
-         encoder_lens=None,
-     ):
-         # NOTE: encoder_lens expected to be zeros or None
-         self.cuda_graph_start_loc.zero_()
-         self.cuda_graph_start_loc[1:bs] = torch.cumsum(seq_lens[: bs - 1], dim=0)
-
-     def get_cuda_graph_seq_len_fill_value(self):
-         return 1
-
      def forward_extend(
          self,
          q,