sglang 0.3.6.post2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. {sglang-0.3.6.post2 → sglang-0.4.0}/PKG-INFO +2 -1
  2. {sglang-0.3.6.post2 → sglang-0.4.0}/pyproject.toml +2 -2
  3. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_offline_throughput.py +55 -2
  4. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_one_batch.py +7 -6
  5. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_one_batch_server.py +4 -3
  6. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_serving.py +13 -0
  7. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/check_env.py +1 -1
  8. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/launch_server.py +3 -2
  9. sglang-0.4.0/sglang/srt/_custom_ops.py +118 -0
  10. sglang-0.4.0/sglang/srt/configs/device_config.py +17 -0
  11. sglang-0.4.0/sglang/srt/configs/load_config.py +84 -0
  12. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/model_config.py +161 -4
  13. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/qwen2vl.py +5 -8
  14. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/outlines_backend.py +6 -1
  15. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/outlines_jump_forward.py +8 -1
  16. sglang-0.4.0/sglang/srt/distributed/__init__.py +3 -0
  17. sglang-0.4.0/sglang/srt/distributed/communication_op.py +34 -0
  18. sglang-0.4.0/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  19. sglang-0.4.0/sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
  20. sglang-0.4.0/sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
  21. sglang-0.4.0/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
  22. sglang-0.4.0/sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
  23. sglang-0.4.0/sglang/srt/distributed/device_communicators/pynccl.py +204 -0
  24. sglang-0.4.0/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
  25. sglang-0.4.0/sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
  26. sglang-0.4.0/sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
  27. sglang-0.4.0/sglang/srt/distributed/parallel_state.py +1275 -0
  28. sglang-0.4.0/sglang/srt/distributed/utils.py +223 -0
  29. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/hf_transformers_utils.py +37 -1
  30. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/flashinfer_backend.py +13 -15
  31. sglang-0.4.0/sglang/srt/layers/attention/torch_native_backend.py +285 -0
  32. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_patch.py +20 -11
  33. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/linear.py +1 -0
  34. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/logits_processor.py +17 -3
  35. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/quantization/__init__.py +34 -0
  36. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/vocab_parallel_embedding.py +1 -0
  37. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora.py +1 -1
  38. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/data_parallel_controller.py +7 -11
  39. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/detokenizer_manager.py +7 -4
  40. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/image_processor.py +1 -1
  41. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/io_struct.py +48 -12
  42. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/schedule_batch.py +42 -36
  43. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/schedule_policy.py +7 -4
  44. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/scheduler.py +111 -46
  45. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/session_controller.py +0 -3
  46. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tokenizer_manager.py +169 -100
  47. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tp_worker.py +36 -3
  48. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
  49. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/cuda_graph_runner.py +16 -7
  50. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/forward_batch_info.py +9 -4
  51. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/model_runner.py +136 -150
  52. sglang-0.4.0/sglang/srt/model_loader/__init__.py +34 -0
  53. sglang-0.4.0/sglang/srt/model_loader/loader.py +1139 -0
  54. sglang-0.4.0/sglang/srt/model_loader/utils.py +41 -0
  55. sglang-0.4.0/sglang/srt/model_loader/weight_utils.py +640 -0
  56. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/baichuan.py +9 -10
  57. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/chatglm.py +6 -15
  58. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/commandr.py +2 -3
  59. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/dbrx.py +2 -3
  60. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/deepseek.py +4 -11
  61. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/deepseek_v2.py +3 -11
  62. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/exaone.py +2 -3
  63. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma.py +2 -6
  64. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma2.py +3 -14
  65. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma2_reward.py +0 -1
  66. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gpt2.py +5 -12
  67. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gpt_bigcode.py +6 -22
  68. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/grok.py +14 -51
  69. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/internlm2.py +2 -3
  70. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/internlm2_reward.py +0 -1
  71. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama.py +97 -27
  72. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_classification.py +1 -2
  73. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_embedding.py +1 -2
  74. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_reward.py +2 -3
  75. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llava.py +10 -12
  76. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llavavid.py +1 -2
  77. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/minicpm.py +4 -7
  78. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/minicpm3.py +6 -19
  79. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mixtral.py +12 -5
  80. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mixtral_quant.py +2 -3
  81. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mllama.py +3 -7
  82. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/olmo.py +2 -8
  83. sglang-0.4.0/sglang/srt/models/olmo2.py +391 -0
  84. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/olmoe.py +3 -5
  85. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/phi3_small.py +8 -8
  86. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen.py +2 -3
  87. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2.py +10 -9
  88. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2_moe.py +4 -11
  89. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2_vl.py +12 -9
  90. sglang-0.4.0/sglang/srt/models/registry.py +99 -0
  91. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/stablelm.py +2 -3
  92. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/torch_native_llama.py +6 -12
  93. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/xverse.py +2 -4
  94. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/xverse_moe.py +4 -11
  95. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/yivl.py +2 -3
  96. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/openai_api/adapter.py +10 -6
  97. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/openai_api/protocol.py +1 -0
  98. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/server.py +303 -204
  99. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/server_args.py +65 -31
  100. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/utils.py +253 -48
  101. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_utils.py +27 -7
  102. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/utils.py +2 -2
  103. sglang-0.4.0/sglang/version.py +1 -0
  104. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/PKG-INFO +2 -1
  105. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/SOURCES.txt +23 -3
  106. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/requires.txt +1 -0
  107. sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
  108. sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
  109. sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/layer.py +0 -630
  110. sglang-0.3.6.post2/sglang/version.py +0 -1
  111. {sglang-0.3.6.post2 → sglang-0.4.0}/LICENSE +0 -0
  112. {sglang-0.3.6.post2 → sglang-0.4.0}/README.md +0 -0
  113. {sglang-0.3.6.post2 → sglang-0.4.0}/setup.cfg +0 -0
  114. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/__init__.py +0 -0
  115. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/api.py +0 -0
  116. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_latency.py +0 -0
  117. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/global_config.py +0 -0
  118. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/__init__.py +0 -0
  119. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/__init__.py +0 -0
  120. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/anthropic.py +0 -0
  121. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/base_backend.py +0 -0
  122. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/litellm.py +0 -0
  123. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/openai.py +0 -0
  124. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/runtime_endpoint.py +0 -0
  125. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/vertexai.py +0 -0
  126. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/chat_template.py +0 -0
  127. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/choices.py +0 -0
  128. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/compiler.py +0 -0
  129. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/interpreter.py +0 -0
  130. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/ir.py +0 -0
  131. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/tracer.py +0 -0
  132. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/launch_server_llavavid.py +0 -0
  133. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/__init__.py +0 -0
  134. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/exaone.py +0 -0
  135. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/__init__.py +0 -0
  136. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  137. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  138. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/conversation.py +0 -0
  139. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/activation.py +0 -0
  140. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/__init__.py +0 -0
  141. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  142. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_backend.py +0 -0
  143. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  144. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  145. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  146. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  147. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/custom_op_util.py +0 -0
  148. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/__init__.py +0 -0
  149. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/fused_moe.py +0 -0
  150. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/layer.py +0 -0
  151. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/layernorm.py +0 -0
  152. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/pooler.py +0 -0
  153. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/quantization/base_config.py +0 -0
  154. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/radix_attention.py +0 -0
  155. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/rotary_embedding.py +0 -0
  156. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/sampler.py +0 -0
  157. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/torchao_utils.py +0 -0
  158. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora_config.py +0 -0
  159. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora_manager.py +0 -0
  160. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  161. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  162. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/flush_cache.py +0 -0
  163. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/memory_pool.py +0 -0
  164. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/radix_cache.py +0 -0
  165. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/metrics/collector.py +0 -0
  166. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/metrics/func_timer.py +0 -0
  167. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mm_utils.py +0 -0
  168. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_parallel.py +0 -0
  169. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mistral.py +0 -0
  170. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  171. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  172. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  173. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  174. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  175. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  176. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  177. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/sampling_params.py +0 -0
  178. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/few_shot_gsm8k.py +0 -0
  179. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  180. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/run_eval.py +0 -0
  181. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/runners.py +0 -0
  182. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_common.py +0 -0
  183. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_gpqa.py +0 -0
  184. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_humaneval.py +0 -0
  185. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_math.py +0 -0
  186. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_mgsm.py +0 -0
  187. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_mmlu.py +0 -0
  188. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  189. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_activation.py +0 -0
  190. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_layernorm.py +0 -0
  191. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_programs.py +0 -0
  192. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/dependency_links.txt +0 -0
  193. {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.6.post2
3
+ Version: 0.4.0
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -241,6 +241,7 @@ Requires-Dist: sglang[runtime_common]; extra == "srt"
241
241
  Requires-Dist: torch; extra == "srt"
242
242
  Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
243
243
  Requires-Dist: cuda-python; extra == "srt"
244
+ Requires-Dist: flashinfer>=0.1.6; extra == "srt"
244
245
  Provides-Extra: srt-hip
245
246
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
246
247
  Requires-Dist: torch; extra == "srt-hip"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.3.6.post2"
7
+ version = "0.4.0"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
23
23
  "psutil", "pydantic", "python-multipart",
24
24
  "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
25
25
  "xgrammar>=0.1.4"]
26
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python"]
26
+ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer>=0.1.6"]
27
27
 
28
28
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
29
29
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -14,20 +14,20 @@ import argparse
14
14
  import dataclasses
15
15
  import json
16
16
  import logging
17
+ import os
17
18
  import random
18
19
  import time
19
20
  from typing import Dict, List, Optional, Tuple
20
21
 
21
22
  import numpy as np
22
23
 
23
- from sglang.api import Engine
24
24
  from sglang.bench_serving import (
25
25
  get_dataset,
26
26
  get_tokenizer,
27
27
  sample_random_requests,
28
28
  set_ulimit,
29
29
  )
30
- from sglang.srt.server import Runtime
30
+ from sglang.srt.server import Engine, Runtime
31
31
  from sglang.srt.server_args import ServerArgs
32
32
 
33
33
 
@@ -52,6 +52,7 @@ class BenchArgs:
52
52
  seed: int = 1
53
53
  skip_warmup: bool = False
54
54
  do_not_exit: bool = False
55
+ profile: bool = False
55
56
 
56
57
  @staticmethod
57
58
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -156,6 +157,12 @@ class BenchArgs:
156
157
  action="store_true",
157
158
  help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
158
159
  )
160
+ parser.add_argument(
161
+ "--profile",
162
+ action="store_true",
163
+ help="Use Torch Profiler. The endpoint must be launched with "
164
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
165
+ )
159
166
 
160
167
  @classmethod
161
168
  def from_cli_args(cls, args: argparse.Namespace):
@@ -169,6 +176,7 @@ def throughput_test_once(
169
176
  reqs: List[Tuple[str, int, int]],
170
177
  ignore_eos: bool,
171
178
  extra_request_body: Dict,
179
+ profile: bool,
172
180
  ):
173
181
  measurement_results = {
174
182
  "backend": backend_name,
@@ -194,7 +202,15 @@ def throughput_test_once(
194
202
  ]
195
203
 
196
204
  st = time.perf_counter()
205
+ if profile:
206
+ backend.start_profile()
207
+
197
208
  gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
209
+
210
+ if profile:
211
+ backend.stop_profile()
212
+ monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
213
+
198
214
  latency = time.perf_counter() - st
199
215
 
200
216
  if backend_name == "runtime":
@@ -221,6 +237,41 @@ def throughput_test_once(
221
237
  return measurement_results
222
238
 
223
239
 
240
def monitor_trace_file(directory, interval=1):
    """Block until a newly created file in ``directory`` stops growing.

    The directory is snapshotted once on entry. It is then re-listed every
    ``interval`` seconds; every file that was not in the snapshot is polled
    until its size stops increasing between two consecutive checks. The
    function returns after the first polling round in which at least one
    new file was seen to stop growing.

    Args:
        directory: Directory to watch. ``None`` means there is nothing to
            watch: a message is printed and the function returns at once.
            (Without this guard ``os.listdir(None)`` would list the current
            working directory and the loop could wait forever.)
        interval: Seconds to sleep between directory listings and between
            size checks of a single file.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be
            listed (for example, when it does not exist).
    """
    if directory is None:
        print(
            "No trace directory given (is SGLANG_TORCH_PROFILER_DIR set?); "
            "skipping trace file monitoring."
        )
        return

    print(f"Monitoring {directory} for new trace files...")

    known_files = set(os.listdir(directory))

    while True:
        flag = False
        time.sleep(interval)
        current_files = set(os.listdir(directory))

        new_files = current_files - known_files
        for new_file in new_files:
            new_file_path = os.path.join(directory, new_file)
            print(f"New file detected: {new_file}")

            previous_size = 0
            while True:
                try:
                    current_size = os.path.getsize(new_file_path)
                except FileNotFoundError:
                    # The file vanished while being watched; give up on it
                    # without marking the round as finished.
                    print(f"File {new_file} is no longer accessible.")
                    break

                if current_size > previous_size:
                    # Still growing: remember the size and check again.
                    previous_size = current_size
                else:
                    # Size did not increase since the last check.
                    flag = True
                    break

                time.sleep(interval)
        if flag:
            break
273
+
274
+
224
275
  def throughput_test(
225
276
  server_args: ServerArgs,
226
277
  bench_args: BenchArgs,
@@ -268,6 +319,7 @@ def throughput_test(
268
319
  reqs=warmup_requests,
269
320
  ignore_eos=not bench_args.disable_ignore_eos,
270
321
  extra_request_body=extra_request_body,
322
+ profile=False,
271
323
  )
272
324
 
273
325
  logging.info("\nBenchmark...")
@@ -277,6 +329,7 @@ def throughput_test(
277
329
  reqs=input_requests,
278
330
  ignore_eos=not bench_args.disable_ignore_eos,
279
331
  extra_request_body=extra_request_body,
332
+ profile=bench_args.profile,
280
333
  )
281
334
 
282
335
  if bench_args.result_filename:
@@ -47,6 +47,7 @@ import itertools
47
47
  import json
48
48
  import logging
49
49
  import multiprocessing
50
+ import os
50
51
  import time
51
52
  from typing import Tuple
52
53
 
@@ -62,11 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner
62
63
  from sglang.srt.sampling.sampling_params import SamplingParams
63
64
  from sglang.srt.server import _set_envs_and_config
64
65
  from sglang.srt.server_args import PortArgs, ServerArgs
65
- from sglang.srt.utils import (
66
- configure_logger,
67
- kill_child_process,
68
- suppress_other_loggers,
69
- )
66
+ from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
70
67
 
71
68
 
72
69
  @dataclasses.dataclass
@@ -114,8 +111,12 @@ def load_model(server_args, port_args, tp_rank):
114
111
  model_config = ModelConfig(
115
112
  server_args.model_path,
116
113
  trust_remote_code=server_args.trust_remote_code,
114
+ revision=server_args.revision,
117
115
  context_length=server_args.context_length,
118
116
  model_override_args=server_args.json_model_override_args,
117
+ is_embedding=server_args.is_embedding,
118
+ dtype=server_args.dtype,
119
+ quantization=server_args.quantization,
119
120
  )
120
121
  model_runner = ModelRunner(
121
122
  model_config=model_config,
@@ -468,4 +469,4 @@ if __name__ == "__main__":
468
469
  main(server_args, bench_args)
469
470
  finally:
470
471
  if server_args.tp_size != 1:
471
- kill_child_process()
472
+ kill_process_tree(os.getpid(), include_parent=False)
@@ -15,6 +15,7 @@ import dataclasses
15
15
  import itertools
16
16
  import json
17
17
  import multiprocessing
18
+ import os
18
19
  import time
19
20
  from typing import Tuple
20
21
 
@@ -23,7 +24,7 @@ import requests
23
24
 
24
25
  from sglang.srt.server import launch_server
25
26
  from sglang.srt.server_args import ServerArgs
26
- from sglang.srt.utils import kill_child_process
27
+ from sglang.srt.utils import kill_process_tree
27
28
 
28
29
 
29
30
  @dataclasses.dataclass
@@ -69,7 +70,7 @@ def launch_server_internal(server_args):
69
70
  except Exception as e:
70
71
  raise e
71
72
  finally:
72
- kill_child_process()
73
+ kill_process_tree(os.getpid(), include_parent=False)
73
74
 
74
75
 
75
76
  def launch_server_process(server_args: ServerArgs):
@@ -175,7 +176,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
175
176
  )
176
177
  finally:
177
178
  if proc:
178
- kill_child_process(proc.pid, include_self=True)
179
+ kill_process_tree(proc.pid)
179
180
 
180
181
  print(f"\nResults are saved to {bench_args.result_filename}")
181
182
 
@@ -51,6 +51,7 @@ class RequestFuncInput:
51
51
  prompt_len: int
52
52
  output_len: int
53
53
  model: str
54
+ lora_name: str
54
55
  extra_request_body: Dict[str, Any]
55
56
 
56
57
 
@@ -319,6 +320,7 @@ async def async_request_sglang_generate(
319
320
  "ignore_eos": not args.disable_ignore_eos,
320
321
  },
321
322
  "stream": not args.disable_stream,
323
+ "lora_path": request_func_input.lora_name,
322
324
  **request_func_input.extra_request_body,
323
325
  }
324
326
  headers = {}
@@ -884,6 +886,7 @@ async def benchmark(
884
886
  request_rate: float,
885
887
  max_concurrency: Optional[int],
886
888
  disable_tqdm: bool,
889
+ lora_name: str,
887
890
  extra_request_body: Dict[str, Any],
888
891
  profile: bool,
889
892
  ):
@@ -909,6 +912,7 @@ async def benchmark(
909
912
  api_url=api_url,
910
913
  prompt_len=test_prompt_len,
911
914
  output_len=test_output_len,
915
+ lora_name=lora_name,
912
916
  extra_request_body=extra_request_body,
913
917
  )
914
918
  test_output = await request_func(request_func_input=test_input)
@@ -942,6 +946,7 @@ async def benchmark(
942
946
  api_url=api_url,
943
947
  prompt_len=prompt_len,
944
948
  output_len=output_len,
949
+ lora_name=lora_name,
945
950
  extra_request_body=extra_request_body,
946
951
  )
947
952
  tasks.append(
@@ -1247,6 +1252,7 @@ def run_benchmark(args_: argparse.Namespace):
1247
1252
  request_rate=args.request_rate,
1248
1253
  max_concurrency=args.max_concurrency,
1249
1254
  disable_tqdm=args.disable_tqdm,
1255
+ lora_name=args.lora_name,
1250
1256
  extra_request_body=extra_request_body,
1251
1257
  profile=args.profile,
1252
1258
  )
@@ -1267,6 +1273,7 @@ def run_benchmark(args_: argparse.Namespace):
1267
1273
  request_rate=rate,
1268
1274
  max_concurrency=args.max_concurrency,
1269
1275
  disable_tqdm=args.disable_tqdm,
1276
+ lora_name=args.lora_name,
1270
1277
  extra_request_body=extra_request_body,
1271
1278
  profile=args.profile,
1272
1279
  )
@@ -1451,5 +1458,11 @@ if __name__ == "__main__":
1451
1458
  help="Use Torch Profiler. The endpoint must be launched with "
1452
1459
  "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
1453
1460
  )
1461
+ parser.add_argument(
1462
+ "--lora-name",
1463
+ type=str,
1464
+ default=None,
1465
+ help="The name of LoRA adapter",
1466
+ )
1454
1467
  args = parser.parse_args()
1455
1468
  run_benchmark(args)
@@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict
9
9
 
10
10
  import torch
11
11
 
12
- # List of packages to check versions for
12
+ # List of packages to check versions
13
13
  PACKAGE_LIST = [
14
14
  "sglang",
15
15
  "flashinfer",
@@ -1,10 +1,11 @@
1
1
  """Launch the inference server."""
2
2
 
3
+ import os
3
4
  import sys
4
5
 
5
6
  from sglang.srt.server import launch_server
6
7
  from sglang.srt.server_args import prepare_server_args
7
- from sglang.srt.utils import kill_child_process
8
+ from sglang.srt.utils import kill_process_tree
8
9
 
9
10
  if __name__ == "__main__":
10
11
  server_args = prepare_server_args(sys.argv[1:])
@@ -12,4 +13,4 @@ if __name__ == "__main__":
12
13
  try:
13
14
  launch_server(server_args)
14
15
  finally:
15
- kill_child_process()
16
+ kill_process_tree(os.getpid(), include_parent=False)
@@ -0,0 +1,118 @@
1
+ # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/_custom_ops.py
2
+ import contextlib
3
+ import functools
4
+ import importlib
5
+ import logging
6
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.library
10
+
11
+ from sglang.srt.utils import is_hpu
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ if not is_hpu():
16
+ try:
17
+ import custom_ar
18
+ except ImportError as e:
19
+ logger.warning("Failed to import from custom_ar with %r", e)
20
+
21
+
22
def hint_on_error(fn):
    """Decorate ``fn`` so selected failures are logged with a troubleshooting hint.

    * ``NotImplementedError`` is logged and re-raised as a new
      ``NotImplementedError`` whose message carries the hint; the original
      exception is attached as ``__cause__``.
    * ``AttributeError`` is logged with a hint and re-raised unchanged.
    * Any other exception propagates untouched.

    Args:
        fn: The callable to wrap. Its ``__name__`` is used in the messages.

    Returns:
        A wrapper with the same call signature and metadata as ``fn``.
    """

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)

        except NotImplementedError as e:
            msg = (
                "Error in calling custom op %s: %s\n"
                "Not implemented or built, most likely because the current device "
                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
                "incorrectly while building)"
            )
            logger.error(msg, fn.__name__, e)
            raise NotImplementedError(msg % (fn.__name__, e)) from e
        except AttributeError as e:
            msg = (
                "Error in calling custom op %s: %s\n"
                "Possibly you have built or installed an obsolete version of vllm.\n"
                "Please try a clean build and install of vllm, "
                "or remove old built files such as vllm/*cpython*.so and build/ ."
            )
            logger.error(msg, fn.__name__, e)
            # Re-raise the original exception object unchanged.
            raise

    return wrapper
49
+
50
+
51
+ # custom ar
52
def init_custom_ar(
    ipc_tensors: List[torch.Tensor],
    rank_data: torch.Tensor,
    rank: int,
    full_nvlink: bool,
) -> int:
    """Call the ``init_custom_ar`` op registered under ``torch.ops._C_vllm_ar``.

    All four arguments are forwarded positionally, in signature order, and
    the op's return value is handed back unchanged.
    """
    op = torch.ops._C_vllm_ar.init_custom_ar
    return op(ipc_tensors, rank_data, rank, full_nvlink)
61
+
62
+
63
def all_reduce(
    fa: int,
    inp: torch.Tensor,
    out: torch.Tensor,
    reg_buffer: int,
    reg_buffer_sz_bytes: int,
) -> None:
    """Call the ``all_reduce`` op registered under ``torch.ops._C_vllm_ar``.

    The arguments are forwarded positionally, in signature order. Whatever
    the op returns is discarded; this function always returns ``None``.
    """
    op = torch.ops._C_vllm_ar.all_reduce
    op(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
71
+
72
+
73
def dispose(fa: int) -> None:
    """Call the ``dispose`` op under ``torch.ops._C_vllm_ar`` with ``fa``.

    The op's return value, if any, is discarded.
    """
    op = torch.ops._C_vllm_ar.dispose
    op(fa)
75
+
76
+
77
def meta_size() -> int:
    """Return the result of the ``meta_size`` op under ``torch.ops._C_vllm_ar``."""
    op = torch.ops._C_vllm_ar.meta_size
    return op()
79
+
80
+
81
def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
    """Call the ``register_buffer`` op under ``torch.ops._C_vllm_ar``.

    ``fa`` and ``ipc_tensors`` are forwarded positionally. The op's result is
    passed straight back to the caller, even though the annotation is ``None``.
    """
    op = torch.ops._C_vllm_ar.register_buffer
    return op(fa, ipc_tensors)
83
+
84
+
85
def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
    """Return the result of the ``get_graph_buffer_ipc_meta`` op for ``fa``.

    The op lives under ``torch.ops._C_vllm_ar``; its return value is passed
    back unchanged.
    """
    op = torch.ops._C_vllm_ar.get_graph_buffer_ipc_meta
    return op(fa)
87
+
88
+
89
def register_graph_buffers(
    fa: int, handles: List[List[int]], offsets: List[List[int]]
) -> None:
    """Call the ``register_graph_buffers`` op under ``torch.ops._C_vllm_ar``.

    ``fa``, ``handles`` and ``offsets`` are forwarded positionally; the op's
    return value, if any, is discarded.
    """
    op = torch.ops._C_vllm_ar.register_graph_buffers
    op(fa, handles, offsets)
93
+
94
+
95
# temporary fix for https://github.com/vllm-project/vllm/issues/5456
# TODO: remove this in v0.6.0
#
# Replace, in this module's namespace, every plain function defined in this
# file that has at least one annotation equal to `torch.Tensor` with the same
# function wrapped by `hint_on_error`.
names_and_values = globals()
names_and_values_to_update = {}
# prepare variables to avoid dict size change during iteration
# (`k`, `v` and `arg` are bound here so that the loop below does not add new
# keys to `globals()` while that dict is being iterated)
k, v, arg = None, None, None
fn_type = type(lambda x: x)
for k, v in names_and_values.items():
    # find functions that are defined in this file and have torch.Tensor
    # in their annotations. `arg == "torch.Tensor"` is used to handle
    # the case when users use `from __future__ import annotations` to turn
    # type hints into strings.
    if (
        isinstance(v, fn_type)
        and v.__code__.co_filename == __file__
        and any(
            arg is torch.Tensor or arg == "torch.Tensor"
            for arg in v.__annotations__.values()
        )
    ):
        # Collect the replacement; the namespace itself is updated only after
        # the iteration has finished.
        names_and_values_to_update[k] = hint_on_error(v)

names_and_values.update(names_and_values_to_update)
# Drop the temporaries from the module namespace. `arg` is not listed here,
# so the name bound above stays defined at module level.
del names_and_values_to_update, names_and_values, v, k, fn_type
@@ -0,0 +1,17 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ import torch
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class DeviceConfig:
    """Target device, kept both as a type string and as a ``torch.device``."""

    device: Optional[torch.device]

    def __init__(self, device: str = "cuda") -> None:
        """Validate ``device`` and build the matching ``torch.device``.

        Args:
            device: One of ``"cuda"``, ``"xpu"`` or ``"hpu"``.

        Raises:
            RuntimeError: If ``device`` is not one of the values above.
        """
        supported_types = ("cuda", "xpu", "hpu")
        if device not in supported_types:
            raise RuntimeError(f"Not supported device type: {device}")

        self.device_type = device
        self.device = torch.device(self.device_type)
@@ -0,0 +1,84 @@
1
+ # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
2
+ import enum
3
+ import json
4
+ import logging
5
+ from dataclasses import dataclass, field
6
+ from typing import List, Optional, Union
7
+
8
+ from sglang.srt.utils import is_hip
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class LoadFormat(str, enum.Enum):
    """Names of the weight-loading formats.

    The class derives from both ``str`` and ``enum.Enum``, so each member is
    also a string equal to its value and can be looked up from that value,
    e.g. ``LoadFormat("auto") is LoadFormat.AUTO``.
    """

    AUTO = "auto"
    PT = "pt"
    SAFETENSORS = "safetensors"
    NPCACHE = "npcache"
    DUMMY = "dummy"
    SHARDED_STATE = "sharded_state"
    GGUF = "gguf"
    BITSANDBYTES = "bitsandbytes"
    MISTRAL = "mistral"
23
+
24
+
25
@dataclass
class LoadConfig:
    """Options controlling where model weights come from and how they are read.

    download_dir: Directory to download and load the weights, default to the
        default cache directory of huggingface.
    load_format: The format of the model weights to load:
        "auto" will try to load the weights in the safetensors format and
            fall back to the pytorch bin format if safetensors format is
            not available.
        "pt" will load the weights in the pytorch bin format.
        "safetensors" will load the weights in the safetensors format.
        "npcache" will load the weights in pytorch format and store
            a numpy cache to speed up the loading.
        "dummy" will initialize the weights with random values, which is
            mainly for profiling.
        "bitsandbytes" will load nf4 type weights.
        Any other value of ``LoadFormat`` is accepted as well; strings are
        lower-cased and converted to ``LoadFormat`` after construction.
    model_loader_extra_config: Extra loader options. A JSON string is decoded
        after construction; any other value is stored as given.
    ignore_patterns: The list of patterns to ignore when loading the model.
        Default to "original/**/*" to avoid repeated loading of llama's
        checkpoints.

    """

    load_format: Union[str, LoadFormat] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
    ignore_patterns: Optional[Union[List[str], str]] = None

    def __post_init__(self):
        """Normalize the fields after dataclass construction."""
        # Only a (non-empty) JSON string is rewritten; a dict or None is kept
        # on the instance exactly as it was passed in.
        model_loader_extra_config = self.model_loader_extra_config or {}
        if isinstance(model_loader_extra_config, str):
            self.model_loader_extra_config = json.loads(model_loader_extra_config)
        self._verify_load_format()

        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
            logger.info(
                "Ignoring the following patterns when downloading weights: %s",
                self.ignore_patterns,
            )
        else:
            # None or empty: fall back to the default pattern.
            self.ignore_patterns = ["original/**/*"]

    def _verify_load_format(self) -> None:
        """Convert ``load_format`` to ``LoadFormat`` and check ROCm support.

        Raises:
            ValueError: If the value does not name a ``LoadFormat`` member, or
                if running under ROCm with a format listed as unsupported.
        """
        if not isinstance(self.load_format, str):
            return

        load_format = self.load_format.lower()
        self.load_format = LoadFormat(load_format)

        # Lower-case format values that cannot be used on ROCm (none today).
        rocm_not_supported_load_format: List[str] = []
        if is_hip() and load_format in rocm_not_supported_load_format:
            # Compare and report member *values* ("auto", "pt", ...): the
            # unsupported list holds values, while `LoadFormat.__members__`
            # would yield the upper-case member names.
            rocm_supported_load_format = [
                fmt.value
                for fmt in LoadFormat
                if fmt.value not in rocm_not_supported_load_format
            ]
            raise ValueError(
                f"load format '{load_format}' is not supported in ROCm. "
                f"Supported load formats are "
                f"{rocm_supported_load_format}"
            )