sglang 0.4.1__tar.gz → 0.4.1.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. {sglang-0.4.1 → sglang-0.4.1.post1}/PKG-INFO +4 -4
  2. {sglang-0.4.1 → sglang-0.4.1.post1}/README.md +2 -2
  3. {sglang-0.4.1 → sglang-0.4.1.post1}/pyproject.toml +5 -2
  4. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_serving.py +11 -3
  5. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/openai.py +10 -0
  6. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/xgrammar_backend.py +6 -0
  7. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
  8. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +17 -4
  9. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/topk.py +14 -0
  10. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_kernel.py +14 -0
  11. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/schedule_policy.py +1 -1
  12. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/scheduler.py +11 -14
  13. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tokenizer_manager.py +54 -45
  14. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/model_runner.py +0 -6
  15. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/loader.py +22 -11
  16. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma2.py +19 -0
  17. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama.py +2 -2
  18. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/openai_api/adapter.py +19 -0
  19. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/openai_api/protocol.py +2 -0
  20. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/sampling_params.py +9 -2
  21. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/server.py +20 -37
  22. sglang-0.4.1.post1/sglang/version.py +1 -0
  23. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/PKG-INFO +4 -4
  24. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/requires.txt +1 -1
  25. sglang-0.4.1/sglang/version.py +0 -1
  26. {sglang-0.4.1 → sglang-0.4.1.post1}/LICENSE +0 -0
  27. {sglang-0.4.1 → sglang-0.4.1.post1}/setup.cfg +0 -0
  28. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/__init__.py +0 -0
  29. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/api.py +0 -0
  30. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_latency.py +0 -0
  31. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_offline_throughput.py +0 -0
  32. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_one_batch.py +0 -0
  33. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_one_batch_server.py +0 -0
  34. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/check_env.py +0 -0
  35. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/global_config.py +0 -0
  36. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/__init__.py +0 -0
  37. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/__init__.py +0 -0
  38. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/anthropic.py +0 -0
  39. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/base_backend.py +0 -0
  40. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/litellm.py +0 -0
  41. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  42. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/vertexai.py +0 -0
  43. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/chat_template.py +0 -0
  44. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/choices.py +0 -0
  45. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/compiler.py +0 -0
  46. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/interpreter.py +0 -0
  47. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/ir.py +0 -0
  48. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/tracer.py +0 -0
  49. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/launch_server.py +0 -0
  50. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/launch_server_llavavid.py +0 -0
  51. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/llama3_eval.py +0 -0
  52. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/_custom_ops.py +0 -0
  53. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/aio_rwlock.py +0 -0
  54. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/__init__.py +0 -0
  55. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/device_config.py +0 -0
  56. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/exaone.py +0 -0
  57. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/load_config.py +0 -0
  58. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/model_config.py +0 -0
  59. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  60. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/__init__.py +0 -0
  61. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  62. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
  63. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  64. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/conversation.py +0 -0
  65. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/__init__.py +0 -0
  66. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/communication_op.py +0 -0
  67. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  68. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  69. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  70. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  71. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  72. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  73. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  74. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  75. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  76. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  77. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/utils.py +0 -0
  78. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  79. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/activation.py +0 -0
  80. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  81. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  82. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  83. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  84. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  85. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  86. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  87. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  88. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/custom_op_util.py +0 -0
  89. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/layernorm.py +0 -0
  90. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/linear.py +0 -0
  91. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/logits_processor.py +0 -0
  92. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  93. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  94. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  95. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  96. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  97. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
  98. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/pooler.py +0 -0
  99. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  100. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  101. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8.py +0 -0
  102. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  103. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/radix_attention.py +0 -0
  104. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
  105. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/sampler.py +0 -0
  106. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  107. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  108. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora.py +0 -0
  109. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora_config.py +0 -0
  110. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora_manager.py +0 -0
  111. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  112. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  113. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/image_processor.py +0 -0
  114. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/io_struct.py +0 -0
  115. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/schedule_batch.py +0 -0
  116. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/session_controller.py +0 -0
  117. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tp_worker.py +0 -0
  118. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  119. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  120. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  121. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  122. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  123. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  124. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/metrics/collector.py +0 -0
  125. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/metrics/func_timer.py +0 -0
  126. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mm_utils.py +0 -0
  127. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  128. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  129. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/__init__.py +0 -0
  130. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/utils.py +0 -0
  131. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
  132. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_parallel.py +0 -0
  133. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/baichuan.py +0 -0
  134. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/chatglm.py +0 -0
  135. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/commandr.py +0 -0
  136. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/dbrx.py +0 -0
  137. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/deepseek.py +0 -0
  138. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/deepseek_v2.py +0 -0
  139. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/exaone.py +0 -0
  140. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma.py +0 -0
  141. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  142. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gpt2.py +0 -0
  143. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  144. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/granite.py +0 -0
  145. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/grok.py +0 -0
  146. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/internlm2.py +0 -0
  147. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  148. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_classification.py +0 -0
  149. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_embedding.py +0 -0
  150. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_reward.py +0 -0
  151. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llava.py +0 -0
  152. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llavavid.py +0 -0
  153. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/minicpm.py +0 -0
  154. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/minicpm3.py +0 -0
  155. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mistral.py +0 -0
  156. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mixtral.py +0 -0
  157. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  158. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mllama.py +0 -0
  159. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmo.py +0 -0
  160. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmo2.py +0 -0
  161. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmoe.py +0 -0
  162. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/phi3_small.py +0 -0
  163. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen.py +0 -0
  164. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2.py +0 -0
  165. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  166. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2_vl.py +0 -0
  167. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/registry.py +0 -0
  168. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/stablelm.py +0 -0
  169. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  170. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/xverse.py +0 -0
  171. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/xverse_moe.py +0 -0
  172. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/yivl.py +0 -0
  173. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  174. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  175. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  176. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  177. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  178. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  179. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  180. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/server_args.py +0 -0
  181. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/utils.py +0 -0
  182. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  183. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  184. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/run_eval.py +0 -0
  185. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/runners.py +0 -0
  186. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_common.py +0 -0
  187. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  188. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  189. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_math.py +0 -0
  190. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  191. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  192. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  193. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_activation.py +0 -0
  194. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_block_fp8.py +0 -0
  195. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_layernorm.py +0 -0
  196. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_programs.py +0 -0
  197. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_utils.py +0 -0
  198. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/utils.py +0 -0
  199. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/SOURCES.txt +0 -0
  200. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/dependency_links.txt +0 -0
  201. {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.4.1 → sglang-0.4.1.post1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.1
+Version: 0.4.1.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -243,7 +243,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.

 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.1 → sglang-0.4.1.post1}/README.md
@@ -57,8 +57,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.

 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.1 → sglang-0.4.1.post1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.4.1"
+version = "0.4.1.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
     "xgrammar>=0.1.6"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post8"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post10"]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -60,6 +60,9 @@ dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 "Homepage" = "https://github.com/sgl-project/sglang"
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"

+[tool.setuptools.package-data]
+"sglang" = ["srt/layers/fused_moe_triton/configs/*.json"]
+
 [tool.setuptools.packages.find]
 exclude = [
     "assets*",

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_serving.py
@@ -897,6 +897,7 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")

+    # Limit concurrency
     # From https://github.com/vllm-project/vllm/pull/9390
     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

@@ -906,6 +907,7 @@
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)

+    # Warmup
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
@@ -924,11 +926,15 @@
             f"are correctly specified. Error: {test_output.error}"
         )
     else:
-        requests.post(base_url + "/flush_cache")
         print("Initial test run completed. Starting main benchmark run...")

-    time.sleep(1.5)
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)

+    # Start profiler
     if profile:
         print("Starting profiler...")
         profile_output = await async_request_profile(
@@ -939,6 +945,7 @@

     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

+    # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

+    # Stop profiler
     if profile:
         print("Stopping profiler...")
         profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@
     if pbar is not None:
         pbar.close()

+    # Compute metrics and print results
     benchmark_duration = time.perf_counter() - benchmark_start_time
-
     metrics, output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
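
The "Limit concurrency" block caps in-flight requests with an optional asyncio.Semaphore (borrowed from vllm#9390, per the comment). The same pattern as a standalone sketch, with illustrative names:

    import asyncio

    async def run_with_cap(inputs, request_func, max_concurrency=None):
        # None means unlimited, matching the benchmark's behavior.
        semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

        async def limited(item):
            if semaphore is None:
                return await request_func(item)
            async with semaphore:  # at most max_concurrency tasks run at once
                return await request_func(item)

        return await asyncio.gather(*(limited(i) for i in inputs))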

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/openai.py
@@ -366,6 +366,11 @@ class OpenAI(BaseBackend):
 def openai_completion(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
@@ -398,6 +403,11 @@ def openai_completion_stream(
 def openai_completion_stream(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
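
Both helpers now strip the new ebnf option before calling the OpenAI client, since the OpenAI API has no grammar-constrained decoding. The warn-and-drop idiom generalizes to any backend-specific kwarg; a small sketch (the helper name is hypothetical, not part of the diff):

    import warnings

    def strip_unsupported_kwargs(kwargs, unsupported=("ebnf",)):
        # Hypothetical helper mirroring the warn-and-delete pattern above.
        for key in unsupported:
            if key in kwargs:
                warnings.warn(f"{key!r} is not supported by this backend. Ignoring.")
                del kwargs[key]
        return kwargs

    # Example: the grammar is dropped, with a warning, before the API call.
    params = strip_unsupported_kwargs({"max_tokens": 16, "ebnf": 'root ::= "yes"'})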

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/xgrammar_backend.py
@@ -126,6 +126,12 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                    f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                )
                return None
        elif key_type == "ebnf":
            try:
                ctx = self.grammar_compiler.compile_grammar(key_string)
            except RuntimeError as e:
                logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
                return None
        elif key_type == "regex":
            logger.warning(
                "regex hasn't been supported by xgrammar yet. This is skipped."

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -292,27 +292,33 @@
     BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)

-    if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
-        if Lq <= 256:
-            BLOCK_M, BLOCK_N = (128, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
-    elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
-        if Lq <= 128:
-            BLOCK_M, BLOCK_N = (128, 128)
-        elif Lq <= 256:
-            BLOCK_M, BLOCK_N = (64, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
+    if is_hip_:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+
     else:
-        BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+        if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (128, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lk <= 64 else 8

     sm_scale = sm_scale or 1.0 / (Lq**0.5)
     batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
     kv_group_num = q_extend.shape[1] // k_extend.shape[1]

     grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
-    num_warps = 4 if Lk <= 64 else 8
     num_stages = 1

     extra_kargs = {}
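
The reshuffle gives ROCm (is_hip_) a fixed 64×64 tile with 4 warps, while CUDA keeps the capability- and head-dimension-dependent heuristic, whose num_warps choice now lives inside the CUDA branch. Restated as a standalone sketch (the helper function is illustrative, not part of the kernel):

    def pick_block_sizes(lq, lk, is_hip, cuda_capability_major=9):
        # Illustrative restatement of the tile-size heuristic in the diff above.
        # Returns ((BLOCK_M, BLOCK_N), num_warps).
        if is_hip:
            return (64, 64), 4
        if cuda_capability_major >= 9:
            block_m, block_n = (128, 64) if lq <= 256 else (32, 64)
        elif cuda_capability_major >= 8:
            if lq <= 128:
                block_m, block_n = (128, 128)
            elif lq <= 256:
                block_m, block_n = (64, 64)
            else:
                block_m, block_n = (32, 64)
        else:
            block_m, block_n = (64, 64) if lq <= 128 else (32, 32)
        return (block_m, block_n), (4 if lk <= 64 else 8)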

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -11,12 +11,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
 from vllm import _custom_ops as ops

 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import direct_register_custom_op, get_device_name
+from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
+
+not_hip = False
+if not is_hip():
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+    not_hip = True

 logger = logging.getLogger(__name__)
 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
@@ -267,8 +272,14 @@
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-    # FIXME(zhyncs)
-    if num_experts >= 256:
+    if not_hip and num_experts >= 224:
+        token_cnts_buffer = torch.empty(
+            (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+        )
+        cumsum_buffer = torch.empty(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
+
         sgl_moe_align_block_size(
             topk_ids,
             num_experts,
@@ -276,6 +287,8 @@
             block_size,
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
         )
     else:
         ops.moe_align_block_size(
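
The not_hip flag makes the sgl_kernel import optional, so the module still imports on ROCm and moe_align_block_size falls back to the vLLM op there (the sgl-kernel path also now starts at 224 experts and passes two scratch buffers). The guarded-import idiom in isolation, with hypothetical names:

    # Generic sketch of the guarded-import pattern used above: prefer an
    # optional accelerated dependency, fall back to a portable baseline.
    try:
        from fast_kernels import fast_sum as _sum  # hypothetical optional dependency
        HAVE_FAST = True
    except ImportError:
        _sum = sum  # portable fallback
        HAVE_FAST = False

    def total(values):
        # Dispatches to the accelerated path only when the import succeeded,
        # mirroring the not_hip guard around sgl_moe_align_block_size.
        return _sum(values)

    print(total([1, 2, 3]), "fast" if HAVE_FAST else "fallback")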

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/topk.py
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import Callable, Optional

 import torch

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_kernel.py
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import List, Tuple

 import torch

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/schedule_policy.py
@@ -248,7 +248,7 @@ class PrefillAdder:
         self.can_run_list.append(req)

         self._prefill_one_req(
-            len(req.prefix_indices),
+            0,
             req.extend_input_len,
             (
                 min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/scheduler.py
@@ -468,9 +468,6 @@ class Scheduler:
                 self.send_to_tokenizer.send_pyobj(
                     UpdateWeightFromDiskReqOutput(success, message)
                 )
-            elif isinstance(recv_req, GetWeightsByNameReqInput):
-                parameter = self.get_weights_by_name(recv_req)
-                self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
             elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
                 success, message = self.init_weights_update_group(recv_req)
                 self.send_to_tokenizer.send_pyobj(
@@ -565,7 +562,7 @@

         if req.logprob_start_len == -1:
             # By default, only return the logprobs for output tokens
-            req.logprob_start_len = len(recv_req.input_ids) - 1
+            req.logprob_start_len = len(req.origin_input_ids) - 1

         # Truncate prompts that are too long
         if len(req.origin_input_ids) > self.max_req_input_len:
@@ -589,12 +586,15 @@
         if (
             req.sampling_params.json_schema is not None
             or req.sampling_params.regex is not None
+            or req.sampling_params.ebnf is not None
         ):
             assert self.grammar_backend is not None
             if req.sampling_params.json_schema is not None:
                 key = ("json", req.sampling_params.json_schema)
             elif req.sampling_params.regex is not None:
                 key = ("regex", req.sampling_params.regex)
+            elif req.sampling_params.ebnf is not None:
+                key = ("ebnf", req.sampling_params.ebnf)

             req.grammar = self.grammar_backend.get_cached_value(key)
             if not req.grammar:
@@ -629,16 +629,13 @@
         self.waiting_queue.append(req)

     def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
-        if isinstance(self.tree_cache, RadixCache):
-            self.tree_cache_metrics["total"] += (
-                adder.log_input_tokens + adder.log_hit_tokens
-            ) / 10**9
-            self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
-            tree_cache_hit_rate = (
-                self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
-            )
-        else:
-            tree_cache_hit_rate = 0.0
+        self.tree_cache_metrics["total"] += (
+            adder.log_input_tokens + adder.log_hit_tokens
+        ) / 10**9
+        self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
+        tree_cache_hit_rate = (
+            self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
+        )

         num_used = self.max_total_num_tokens - (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tokenizer_manager.py
@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union

 import fastapi
 import uvloop
@@ -173,6 +173,15 @@ class TokenizerManager:

         # Others
         self.gracefully_exit = False
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )

         # Metrics
         if self.enable_metrics:
@@ -190,8 +199,7 @@
     ):
         created_time = time.time()

-        if self.to_create_loop:
-            self.create_handle_loop()
+        self.auto_create_handle_loop()

         if isinstance(obj, EmbeddingReqInput) and self.is_generation:
             raise ValueError(
@@ -440,8 +448,7 @@
         obj: UpdateWeightFromDiskReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-        if self.to_create_loop:
-            self.create_handle_loop()
+        self.auto_create_handle_loop()

         # default the load format to the server_args
         if obj.load_format is None:
@@ -456,7 +463,7 @@

     async def _wait_for_model_update_from_disk(
         self, obj: UpdateWeightFromDiskReqInput
-    ) -> Tuple[bool, str, int]:
+    ) -> Tuple[bool, str]:
         self.send_to_scheduler.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         if self.server_args.dp_size == 1:
@@ -485,15 +492,11 @@
         obj: InitWeightsUpdateGroupReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-        if self.to_create_loop:
-            self.create_handle_loop()
-        self.send_to_scheduler.send_pyobj(obj)
-
-        self.init_weights_update_group_result = asyncio.Future()
+        self.auto_create_handle_loop()
         assert (
             self.server_args.dp_size == 1
         ), "dp_size must be 1 for init parameter update group"
-        result = await self.init_weights_update_group_result
+        result = (await self.init_weights_update_group_communicator(obj))[0]
         return result.success, result.message

     async def update_weights_from_distributed(
@@ -501,44 +504,32 @@
         obj: UpdateWeightsFromDistributedReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-        if self.to_create_loop:
-            self.create_handle_loop()
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be for update weights from distributed"

         # This means that weight sync
         # cannot run while requests are in progress.
         async with self.model_update_lock.writer_lock:
-            self.send_to_scheduler.send_pyobj(obj)
-            self.parameter_update_result: Awaitable[
-                UpdateWeightsFromDistributedReqOutput
-            ] = asyncio.Future()
-            assert (
-                self.server_args.dp_size == 1
-            ), "dp_size must be for update weights from distributed"
-            result = await self.parameter_update_result
+            result = (await self.update_weights_from_distributed_communicator(obj))[0]
             return result.success, result.message

     async def get_weights_by_name(
         self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
     ):
-        if self.to_create_loop:
-            self.create_handle_loop()
-
-        self.send_to_scheduler.send_pyobj(obj)
-        self.get_weights_by_name_result = asyncio.Future()
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
         if self.server_args.dp_size == 1:
-            result = await self.get_weights_by_name_result
-            return result.parameter
+            return all_parameters[0]
         else:
-            self.get_weights_by_name_tmp = []
-            result = await self.get_weights_by_name_result
-            all_parameters = [r.parameter for r in result]
             return all_parameters

     async def open_session(
         self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
     ):
-        if self.to_create_loop:
-            self.create_handle_loop()
+        self.auto_create_handle_loop()

         session_id = uuid.uuid4().hex
         obj.session_id = session_id
@@ -568,7 +559,7 @@
             background_tasks.add_task(abort_request)
         return background_tasks

-    def create_handle_loop(self):
+    def auto_create_handle_loop(self):
         if not self.to_create_loop:
             return
@@ -711,21 +702,14 @@
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for init parameter update group"
-            self.init_weights_update_group_result.set_result(recv_obj)
+            self.init_weights_update_group_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, UpdateWeightsFromDistributedReqOutput):
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for update weights from distributed"
-            self.parameter_update_result.set_result(recv_obj)
+            self.update_weights_from_distributed_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, GetWeightsByNameReqOutput):
-            if self.server_args.dp_size == 1:
-                self.get_weights_by_name_result.set_result(recv_obj)
-            else:
-                self.get_weights_by_name_tmp.append(recv_obj)
-                if len(self.get_weights_by_name_tmp) == self.server_args.dp_size:
-                    self.get_weights_by_name_result.set_result(
-                        self.get_weights_by_name_tmp
-                    )
+            self.get_weights_by_name_communicator.handle_recv(recv_obj)
         else:
             raise ValueError(f"Invalid object: {recv_obj=}")
@@ -809,3 +793,28 @@
             f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
         )
         self.tokenizer_manager.gracefully_exit = True
+
+
+T = TypeVar("T")
+
+
+class _Communicator(Generic[T]):
+    def __init__(self, sender, fan_out: int):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._result_future: Optional[asyncio.Future] = None
+        self._result_values: Optional[List[T]] = None
+
+    async def __call__(self, obj):
+        self._sender.send_pyobj(obj)
+        self._result_future = asyncio.Future()
+        self._result_values = []
+        await self._result_future
+        result_values = self._result_values
+        self._result_future = self._result_values = None
+        return result_values
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_future.set_result(None)
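
_Communicator replaces the per-request futures and the get_weights_by_name_tmp list with one fan-in primitive: send once, collect exactly fan_out replies, then resolve. A self-contained usage sketch; the dummy sender stands in for the ZMQ socket and the reply coroutine simulates the scheduler side:

    import asyncio

    class _DummySender:
        # Stand-in for the ZMQ socket: just remembers the last object "sent".
        def send_pyobj(self, obj):
            self.last = obj

    async def main():
        sender = _DummySender()
        comm = _Communicator(sender, fan_out=2)  # e.g. dp_size == 2

        async def scheduler_replies():
            await asyncio.sleep(0)  # let __call__ install its future first
            comm.handle_recv("reply-from-rank-0")
            comm.handle_recv("reply-from-rank-1")  # second reply resolves the future

        results, _ = await asyncio.gather(comm("weights-request"), scheduler_replies())
        print(results)  # ['reply-from-rank-0', 'reply-from-rank-1']

    asyncio.run(main())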

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/model_runner.py
@@ -95,12 +95,6 @@ class ModelRunner:
         ):
             logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"
-            # FIXME(HandH1998)
-            if (
-                "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
-                and not self.server_args.disable_cuda_graph
-            ):
-                self.server_args.disable_cuda_graph = True

         if self.server_args.enable_double_sparsity:
             logger.info(

{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/loader.py
@@ -770,6 +770,21 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 quant_state_dict,
             )

+    def _is_8bit_weight_name(self, weight_name: str):
+        quantized_suffix = {".scb", ".weight_format"}
+        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+    def _is_4bit_weight_name(self, weight_name: str):
+        quantized_suffix = {
+            "absmax",
+            "quant_map",
+            "nested_absmax",
+            "nested_quant_map",
+            "bitsandbytes",
+        }
+        suffix = weight_name.split(".")[-1]
+        return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
     def _quantized_8bit_generator(
         self, hf_weights_files, use_safetensors, quant_state_dict
     ) -> Generator:
@@ -779,21 +794,18 @@ class BitsAndBytesModelLoader(BaseModelLoader):
            if not weight_name.lower().endswith(".scb"):
                continue

-            weight_key = weight_name.lower().replace(".scb", ".qweight")
+            weight_key = weight_name.lower().replace(".scb", ".weight")
            quant_state_dict[weight_key] = weight_tensor

        for weight_name, weight_tensor in self._hf_weight_iter(
            hf_weights_files, use_safetensors
        ):
-
-            if not weight_name.endswith((".weight", ".bias")):
+            if self._is_8bit_weight_name(weight_name):
                continue

-            qweight_name = weight_name.replace(".weight", ".qweight")
-
-            if qweight_name in quant_state_dict:
+            if weight_name in quant_state_dict:
                set_weight_attrs(weight_tensor, {"load_in_8bit": True})
-                yield qweight_name, weight_tensor
+                yield weight_name, weight_tensor
            else:
                yield weight_name, weight_tensor

@@ -806,7 +818,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
        weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
        temp_state_dict = {}
        for weight_name, weight_tensor in weight_iterator:
-            if weight_name.endswith((".weight", ".bias")):
+            if not self._is_4bit_weight_name(weight_name):
                continue
            # bitsandbytes library requires
            # weight.quant_state.bitsandbytes__* in CPU
@@ -830,16 +842,15 @@ class BitsAndBytesModelLoader(BaseModelLoader):
            hf_weights_files, use_safetensors
        ):

-            if not weight_name.endswith((".weight", ".bias")):
+            if self._is_4bit_weight_name(weight_name):
                continue

            if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
                f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
            ):
                quant_state = _parse_quant_state(weight_name, temp_state_dict)
-                weight_name = weight_name.replace(".weight", ".qweight")
                quant_state_dict[weight_name] = quant_state
-                yield weight_name.replace(".weight", ".qweight"), weight_tensor
+                yield weight_name, weight_tensor
            else:
                yield weight_name, weight_tensor
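
The two predicates classify bitsandbytes checkpoint keys by suffix instead of assuming a .weight/.bias naming scheme, which is what allows the .qweight renaming above to be dropped. A free-function restatement applied to sample keys (illustrative only, not the loader's API):

    # Sketch: the new suffix predicates as free functions, applied to sample keys.
    def is_8bit_weight_name(name: str) -> bool:
        return any(name.lower().endswith(s) for s in {".scb", ".weight_format"})

    def is_4bit_weight_name(name: str) -> bool:
        suffix = name.split(".")[-1]
        quant_suffixes = {"absmax", "quant_map", "nested_absmax",
                          "nested_quant_map", "bitsandbytes"}
        return any(q in suffix for q in quant_suffixes)

    for name in [
        "model.layers.0.mlp.down_proj.weight",         # real weight -> (False, False)
        "model.layers.0.mlp.down_proj.weight.absmax",  # 4-bit quant state -> (True, False)
        "model.layers.0.mlp.down_proj.SCB",            # 8-bit scale -> (False, True)
    ]:
        print(name, is_4bit_weight_name(name), is_8bit_weight_name(name))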