sglang 0.3.5.post2__tar.gz → 0.3.6.post1__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (173)
  1. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/LICENSE +1 -1
  2. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/PKG-INFO +28 -19
  3. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/README.md +11 -13
  4. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/pyproject.toml +14 -6
  5. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/__init__.py +2 -2
  6. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/api.py +2 -2
  7. sglang-0.3.6.post1/sglang/bench_latency.py +1 -0
  8. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/bench_offline_throughput.py +48 -20
  9. sglang-0.3.5.post2/sglang/bench_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch.py +21 -102
  10. sglang-0.3.5.post2/sglang/bench_server_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch_server.py +3 -3
  11. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/bench_serving.py +125 -6
  12. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/check_env.py +3 -6
  13. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/base_backend.py +1 -1
  14. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/runtime_endpoint.py +2 -2
  15. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/model_config.py +13 -14
  16. sglang-0.3.6.post1/sglang/srt/constrained/__init__.py +16 -0
  17. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/base_grammar_backend.py +13 -15
  18. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_backend.py +28 -17
  19. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_jump_forward.py +13 -15
  20. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/xgrammar_backend.py +47 -58
  21. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/conversation.py +13 -15
  22. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/hf_transformers_utils.py +13 -15
  23. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/activation.py +16 -13
  24. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/flashinfer_backend.py +106 -54
  25. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_backend.py +9 -7
  26. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  27. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  28. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  29. sglang-0.3.6.post1/sglang/srt/layers/custom_op_util.py +25 -0
  30. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  31. {sglang-0.3.5.post2/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/fused_moe.py +11 -4
  32. {sglang-0.3.5.post2/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/layer.py +4 -9
  33. sglang-0.3.5.post2/sglang/srt/layers/fused_moe/patch.py → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_patch.py +5 -0
  34. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  35. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  36. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  37. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/layernorm.py +17 -15
  38. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/logits_processor.py +23 -25
  39. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/__init__.py +77 -17
  40. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/radix_attention.py +13 -15
  41. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/rotary_embedding.py +13 -13
  42. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/sampler.py +4 -8
  43. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/torchao_utils.py +2 -0
  44. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora.py +13 -14
  45. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora_config.py +13 -14
  46. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora_manager.py +22 -24
  47. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/data_parallel_controller.py +98 -27
  48. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/detokenizer_manager.py +13 -15
  49. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/io_struct.py +63 -21
  50. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_batch.py +154 -59
  51. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_policy.py +18 -16
  52. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/scheduler.py +278 -109
  53. sglang-0.3.6.post1/sglang/srt/managers/session_controller.py +61 -0
  54. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tokenizer_manager.py +63 -18
  55. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker.py +25 -16
  56. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
  57. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/metrics/collector.py +13 -15
  58. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/metrics/func_timer.py +13 -15
  59. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mm_utils.py +13 -14
  60. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/cuda_graph_runner.py +63 -25
  61. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/forward_batch_info.py +128 -32
  62. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/model_runner.py +132 -64
  63. sglang-0.3.6.post1/sglang/srt/model_parallel.py +98 -0
  64. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/chatglm.py +15 -16
  65. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/commandr.py +15 -16
  66. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/dbrx.py +15 -16
  67. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/deepseek.py +15 -15
  68. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/deepseek_v2.py +162 -59
  69. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/exaone.py +14 -15
  70. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma.py +14 -14
  71. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma2.py +31 -25
  72. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma2_reward.py +13 -14
  73. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gpt_bigcode.py +14 -14
  74. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/grok.py +15 -15
  75. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/internlm2.py +13 -15
  76. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/internlm2_reward.py +13 -14
  77. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama.py +21 -21
  78. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_classification.py +13 -14
  79. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_reward.py +13 -14
  80. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llava.py +14 -16
  81. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llavavid.py +14 -16
  82. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/minicpm.py +13 -15
  83. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/minicpm3.py +13 -15
  84. sglang-0.3.6.post1/sglang/srt/models/mistral.py +23 -0
  85. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mixtral.py +15 -15
  86. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mixtral_quant.py +14 -14
  87. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/olmo.py +22 -20
  88. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/olmoe.py +23 -20
  89. sglang-0.3.6.post1/sglang/srt/models/phi3_small.py +447 -0
  90. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen.py +14 -14
  91. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2.py +22 -19
  92. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_moe.py +17 -18
  93. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_vl.py +13 -6
  94. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/stablelm.py +18 -16
  95. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/torch_native_llama.py +107 -93
  96. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/xverse.py +13 -14
  97. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/xverse_moe.py +15 -16
  98. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/yivl.py +13 -15
  99. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/openai_api/adapter.py +19 -17
  100. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/openai_api/protocol.py +14 -16
  101. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  102. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  103. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  104. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  105. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  106. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_batch_info.py +61 -57
  107. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_params.py +14 -16
  108. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/server.py +86 -35
  109. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/server_args.py +96 -80
  110. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/utils.py +266 -68
  111. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k.py +8 -4
  112. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/runners.py +38 -20
  113. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  114. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_utils.py +31 -20
  115. sglang-0.3.6.post1/sglang/version.py +1 -0
  116. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/PKG-INFO +28 -19
  117. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/SOURCES.txt +13 -5
  118. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/requires.txt +18 -4
  119. sglang-0.3.5.post2/sglang/srt/constrained/__init__.py +0 -17
  120. sglang-0.3.5.post2/sglang/srt/layers/fused_moe/__init__.py +0 -1
  121. sglang-0.3.5.post2/sglang/srt/models/mistral.py +0 -25
  122. sglang-0.3.5.post2/sglang/version.py +0 -1
  123. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/setup.cfg +0 -0
  124. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/global_config.py +0 -0
  125. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/__init__.py +0 -0
  126. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/__init__.py +0 -0
  127. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/anthropic.py +0 -0
  128. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/litellm.py +0 -0
  129. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/openai.py +0 -0
  130. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/vertexai.py +0 -0
  131. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/chat_template.py +0 -0
  132. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/choices.py +0 -0
  133. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/compiler.py +0 -0
  134. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/interpreter.py +0 -0
  135. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/ir.py +0 -0
  136. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/tracer.py +0 -0
  137. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/launch_server.py +0 -0
  138. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/launch_server_llavavid.py +0 -0
  139. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/__init__.py +0 -0
  140. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/exaone.py +0 -0
  141. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  142. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  143. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  144. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  145. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/linear.py +0 -0
  146. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/pooler.py +0 -0
  147. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  148. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  149. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/image_processor.py +0 -0
  150. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  151. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  152. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  153. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  154. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  155. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/baichuan.py +0 -0
  156. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gpt2.py +0 -0
  157. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_embedding.py +0 -0
  158. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mllama.py +0 -0
  159. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  160. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  161. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/run_eval.py +0 -0
  162. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_common.py +0 -0
  163. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  164. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  165. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_math.py +0 -0
  166. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  167. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  168. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_activation.py +0 -0
  169. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_layernorm.py +0 -0
  170. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_programs.py +0 -0
  171. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/utils.py +0 -0
  172. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/dependency_links.txt +0 -0
  173. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.3.5.post2/LICENSE
+++ sglang-0.3.6.post1/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2023-2024 SGLang Team
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
--- sglang-0.3.5.post2/PKG-INFO
+++ sglang-0.3.6.post1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5.post2
+Version: 0.3.6.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
           same "printed page" as the copyright notice for easier
           identification within third-party archives.
 
-       Copyright [yyyy] [name of copyright owner]
+       Copyright 2023-2024 SGLang Team
 
        Licensed under the Apache License, Version 2.0 (the "License");
        you may not use this file except in compliance with the License.
@@ -222,29 +222,32 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
-Requires-Dist: modelscope; extra == "runtime-common"
+Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.6.3.post1; extra == "srt"
+Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Provides-Extra: srt-hpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +277,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
 Requires-Dist: sglang[litellm]; extra == "all-xpu"
+Provides-Extra: all-hpu
+Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+Requires-Dist: sglang[openai]; extra == "all-hpu"
+Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +291,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
 Provides-Extra: dev-xpu
 Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
 Requires-Dist: sglang[test]; extra == "dev-xpu"
+Provides-Extra: dev-hpu
+Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +332,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
-Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +349,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-## Citation And Acknowledgment
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
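
The new srt-hpu / all-hpu / dev-hpu extras above follow the same install pattern as the existing extras; a sketch (it assumes the Gaudi-specific vLLM setup referenced in pyproject.toml below is already in place):

    pip install "sglang[all_hpu]"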
--- sglang-0.3.5.post2/README.md
+++ sglang-0.3.6.post1/README.md
@@ -35,21 +35,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
-Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -57,6 +52,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-## Citation And Acknowledgment
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
--- sglang-0.3.5.post2/pyproject.toml
+++ sglang-0.3.6.post1/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5.post2"
+version = "0.3.6.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -16,11 +16,14 @@ classifiers = [
 dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
-runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
-    "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
-    "outlines>=0.0.44,<0.1.0", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+runtime_common = ["aiohttp", "decord", "fastapi",
+    "hf_transfer", "huggingface_hub", "interegular", "modelscope",
+    "orjson", "outlines>=0.0.44,<0.1.0",
+    "packaging", "pillow", "prometheus-client>=0.20.0",
+    "psutil", "pydantic", "python-multipart",
+    "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
+    "xgrammar>=0.1.4"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -28,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
 srt_xpu = ["sglang[runtime_common]"]
+#For Intel Gaudi(device : hpu) follow the installation guide
+#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -43,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
--- sglang-0.3.5.post2/sglang/__init__.py
+++ sglang-0.3.6.post1/sglang/__init__.py
@@ -11,7 +11,7 @@ from sglang.api import (
     gen,
     gen_int,
     gen_string,
-    get_server_args,
+    get_server_info,
     image,
     select,
     set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
     "gen",
     "gen_int",
     "gen_string",
-    "get_server_args",
+    "get_server_info",
     "image",
     "select",
     "set_default_backend",
--- sglang-0.3.5.post2/sglang/api.py
+++ sglang-0.3.6.post1/sglang/api.py
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     return backend.flush_cache()
 
 
-def get_server_args(backend: Optional[BaseBackend] = None):
+def get_server_info(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     # If backend is Runtime
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
-    return backend.get_server_args()
+    return backend.get_server_info()
 
 
 def gen(
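
This rename pairs with the export change in sglang/__init__.py above. A minimal usage sketch of the renamed call from the frontend (assumes a locally running SGLang server; the URL is illustrative):

    import sglang as sgl

    # Point the frontend at a running SGLang server.
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

    # Previously sgl.get_server_args(); renamed in 0.3.6:
    print(sgl.get_server_info())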
--- /dev/null
+++ sglang-0.3.6.post1/sglang/bench_latency.py
@@ -0,0 +1 @@
+raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")
--- sglang-0.3.5.post2/sglang/bench_offline_throughput.py
+++ sglang-0.3.6.post1/sglang/bench_offline_throughput.py
@@ -1,20 +1,13 @@
 """
-Benchmark the throughput of using the offline LLM engine.
-This script does not launch a server.
+Benchmark the throughput in the offline mode.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
 
 # Usage
 ## Sharegpt dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
 
 ## Random dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
-
-## Shared prefix dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
-
-## Sharegpt dataset on runtime backend
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
 """
 
 import argparse
@@ -23,7 +16,7 @@ import json
 import logging
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 
@@ -55,7 +48,10 @@ class BenchArgs:
     gen_question_len: int = 128
     gen_output_len: int = 256
     disable_ignore_eos: bool = False
+    extra_request_body: Optional[str] = None
     seed: int = 1
+    skip_warmup: bool = False
+    do_not_exit: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -142,7 +138,24 @@
             default=BenchArgs.disable_ignore_eos,
             help="Disable ignore EOS token",
         )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
         parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--skip-warmup",
+            action="store_true",
+            help="Skip the warmup batches.",
+        )
+        parser.add_argument(
+            "--do-not-exit",
+            action="store_true",
+            help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
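
Taken together, the new flags compose in the usual command-line way; a sketch of one possible invocation (model path copied from the docstring above, the JSON payload is illustrative rather than a default):

    python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10 --skip-warmup --extra-request-body '{"top_p": 0.9}'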
@@ -155,6 +168,7 @@ def throughput_test_once(
     backend,
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
+    extra_request_body: Dict,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -174,6 +188,7 @@
             "temperature": 0,
             "max_new_tokens": r[2],
             "ignore_eos": ignore_eos,
+            **extra_request_body,
         }
         for r in reqs
     ]
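
Because the extra body is spliced in with `**`, any key supplied via --extra-request-body overrides the defaults built just above it. A minimal Python sketch of the same semantics (values illustrative):

    import json

    extra_request_body = json.loads('{"top_p": 0.9, "temperature": 0.7}')
    payload = {
        "temperature": 0,
        "max_new_tokens": 32,
        "ignore_eos": True,
        **extra_request_body,  # later keys win, so temperature ends up 0.7
    }
    assert payload["temperature"] == 0.7 and payload["top_p"] == 0.9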
@@ -227,31 +242,41 @@
     random.seed(bench_args.seed)
     np.random.seed(bench_args.seed)
 
+    # Parse args
+    extra_request_body = {}
+    if bench_args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
     # Read dataset
     input_requests = get_dataset(bench_args, tokenizer)
 
     warmup_requests = sample_random_requests(
-        input_len=20,
-        output_len=4,
-        num_prompts=2,
+        input_len=256,
+        output_len=16,
+        num_prompts=16,
         range_ratio=0.8,
         tokenizer=tokenizer,
         dataset_path=bench_args.dataset_path,
     )
 
     # Warm up
-    throughput_test_once(
-        backend_name=bench_args.backend,
-        backend=backend,
-        reqs=warmup_requests,
-        ignore_eos=not bench_args.disable_ignore_eos,
-    )
+    if not bench_args.skip_warmup:
+        logging.info("\nWarmup...")
+        throughput_test_once(
+            backend_name=bench_args.backend,
+            backend=backend,
+            reqs=warmup_requests,
+            ignore_eos=not bench_args.disable_ignore_eos,
+            extra_request_body=extra_request_body,
+        )
 
+    logging.info("\nBenchmark...")
     result = throughput_test_once(
         backend_name=bench_args.backend,
         backend=backend,
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
+        extra_request_body=extra_request_body,
     )
 
     if bench_args.result_filename:
@@ -307,3 +332,6 @@ if __name__ == "__main__":
     )
 
     throughput_test(server_args, bench_args)
+
+    while bench_args.do_not_exit:
+        pass
--- sglang-0.3.5.post2/sglang/bench_latency.py
+++ sglang-0.3.6.post1/sglang/bench_one_batch.py
@@ -1,20 +1,17 @@
 """
-Benchmark the latency of running a single static batch.
+Benchmark the latency of running a single static batch without a server.
+
 This script does not launch a server and uses the low-level APIs.
-It accepts arguments similar to those of launch_server.py.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 # Usage (latency test)
 ## with dummy weights:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
-## do some changes, and store the results under a different run_name:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
-## plot the results in series of lines:
-python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 
 # Usage (correctness test):
-python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
+python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
 input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
@@ -50,13 +47,10 @@ import itertools
 import json
 import logging
 import multiprocessing
-import os
-import sqlite3
 import time
 from typing import Tuple
 
 import numpy as np
-import pandas as pd
 import torch
 import torch.distributed as dist
 
@@ -77,19 +71,14 @@ from sglang.srt.utils import (
 
 @dataclasses.dataclass
 class BenchArgs:
-    run_name: str = "before"
+    run_name: str = "default"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
-    result_filename: str = ""
+    result_filename: str = "result.jsonl"
     correctness_test: bool = False
     # This is only used for correctness test
    cut_len: int = 4
-    # Plotting args
-    graph_sql: str = (
-        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
-    )
-    graph_filename: str = "out.png"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -108,11 +97,6 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
-        # graphing
-        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
-        parser.add_argument(
-            "--graph-filename", type=str, default=BenchArgs.graph_filename
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -220,7 +204,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs
 
 
-@torch.inference_mode()
+@torch.no_grad
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -228,6 +212,7 @@
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
         model_config=model_runner.model_config,
+        enable_overlap=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -237,7 +222,7 @@
     return next_token_ids, logits_output.next_token_logits, batch
 
 
-@torch.inference_mode()
+@torch.no_grad
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
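
Both `extend` and `decode` switch from `@torch.inference_mode()` to the bare `@torch.no_grad` decorator form, which recent PyTorch accepts without parentheses; unlike inference_mode, tensors produced under no_grad are not marked inference-only and can still be used with autograd later. A small sketch of the decorator's effect (illustrative function, not from the package):

    import torch

    @torch.no_grad  # equivalent to @torch.no_grad() in recent PyTorch
    def step(x: torch.Tensor) -> torch.Tensor:
        return x * 2

    y = step(torch.ones(4, requires_grad=True))
    assert not y.requires_grad  # no autograd graph was recorded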
@@ -254,6 +239,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -274,7 +260,7 @@
         bench_args, input_ids, reqs, model_runner
     )
 
-    # Extend
+    # Extend (prefill w/ KV cache)
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
     rank_print(f"prefill logits (final): {next_token_logits} \n")
 
@@ -286,17 +272,14 @@
     for i in range(len(reqs)):
         output_ids[i].append(next_token_ids_list[i])
 
-    # Print
+    # Print output texts
     for i in range(len(reqs)):
         rank_print(f"========== Prompt {i} ==========")
         rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
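
The rewritten `synchronize` leans on `torch.get_device_module`, which maps a device string to its backend module and thus covers new device types without per-device branches; the helper is available in newer PyTorch releases. A quick sketch:

    import torch

    def synchronize(device: str) -> None:
        # torch.get_device_module("cuda") is torch.cuda, "xpu" -> torch.xpu, etc.
        torch.get_device_module(device).synchronize()

    if torch.cuda.is_available():
        synchronize("cuda")  # same effect as torch.cuda.synchronize()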
@@ -352,7 +335,7 @@
         f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
 
-    # record decode timing from 2nd output
+    # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
         med_decode_throughput = batch_size / med_decode_latency
@@ -367,7 +350,7 @@
         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
     )
     measurement_results["total_latency"] = tot_latency
-    measurement_results["total_throughput"] = throughput
+    measurement_results["overall_throughput"] = throughput
     return measurement_results
 
 
@@ -377,6 +360,7 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -423,71 +407,9 @@
 
     # Write results in jsonlines format on rank 0.
     if tp_rank == 0 and bench_args.result_filename:
-        import jsonlines
-
-        with jsonlines.open(bench_args.result_filename, "a") as f:
-            f.write_all(result_list)
-
-
-def plot_latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
-):
-    assert tp_rank == 0
-
-    # read the jsonl file and put in sqlite
-    df = pd.read_json(bench_args.result_filename, lines=True)
-    conn = sqlite3.connect(":memory:")
-    cur = conn.cursor()
-
-    # get the columns and their types
-    column_names = list(df.iloc[0].keys())
-    type_dict = {
-        str: "TEXT",
-        np.int64: "INTEGER",
-        np.float64: "FLOAT",
-    }
-    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
-
-    # create the table
-    cur.execute(
-        f"""
-        CREATE TABLE IF NOT EXISTS results (
-            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
-        )
-        """
-    )
-    conn.commit()
-
-    # write the results to DB
-    df.to_sql("results", conn, if_exists="replace", index=False)
-    conn.commit()
-
-    # read it back using sql
-    df = pd.read_sql_query(bench_args.graph_sql, conn)
-    conn.close()
-
-    # plot it and save to a file
-    import matplotlib.pyplot as plt
-
-    assert (
-        len(df.columns) == 3
-    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
-    for label in df[df.columns[0]].unique():
-        q = f"{df.columns[0]}=='{label}'"
-        series = df.query(q)
-        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
-    plt.xlabel(df.columns[1])
-    plt.ylabel(df.columns[2])
-    plt.legend()
-    plt.savefig(bench_args.graph_filename, dpi=300)
-
-    # if in kitty, just dump it to the terminal
-    if os.environ["TERM"] == "xterm-kitty":
-        os.system(
-            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
-        )
+        with open(bench_args.result_filename, "a") as fout:
+            for result in result_list:
+                fout.write(json.dumps(result) + "\n")
 
 
 def main(server_args, bench_args):
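
With the jsonlines/pandas/sqlite plotting path removed, results are appended as plain JSON Lines, so they can be read back with just the standard library. A sketch using the new result_filename default (filename illustrative):

    import json

    with open("result.jsonl") as fin:
        results = [json.loads(line) for line in fin if line.strip()]
    print(results[0]["overall_throughput"])  # key renamed from total_throughput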
@@ -498,9 +420,6 @@ def main(server_args, bench_args):
             work_func = correctness_test
         else:
             work_func = latency_test
-    elif os.path.isfile(bench_args.result_filename):
-        assert bench_args.graph_filename, "please provide a filename for the graph"
-        work_func = plot_latency_test
     else:
         raise ValueError(
             "Provide --model-path for running the tests or "
--- sglang-0.3.5.post2/sglang/bench_server_latency.py
+++ sglang-0.3.6.post1/sglang/bench_one_batch_server.py
@@ -1,10 +1,10 @@
 """
-Benchmark the latency of serving a single batch with a real server.
+Benchmark the latency of running a single batch with a server.
+
 This script launches a server and uses the HTTP interface.
-It accepts arguments similar to those of launch_server.py.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
 python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8