sglang 0.3.5.post1__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. {sglang-0.3.5.post1 → sglang-0.3.6}/PKG-INFO +5 -5
  2. {sglang-0.3.5.post1 → sglang-0.3.6}/README.md +1 -1
  3. {sglang-0.3.5.post1 → sglang-0.3.6}/pyproject.toml +9 -6
  4. sglang-0.3.6/sglang/bench_latency.py +1 -0
  5. sglang-0.3.6/sglang/bench_offline_throughput.py +337 -0
  6. sglang-0.3.5.post1/sglang/bench_latency.py → sglang-0.3.6/sglang/bench_one_batch.py +19 -98
  7. sglang-0.3.5.post1/sglang/bench_server_latency.py → sglang-0.3.6/sglang/bench_one_batch_server.py +3 -3
  8. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/bench_serving.py +115 -31
  9. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/check_env.py +3 -6
  10. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/constrained/base_grammar_backend.py +4 -3
  11. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/constrained/outlines_backend.py +39 -26
  12. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/constrained/xgrammar_backend.py +58 -14
  13. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/activation.py +3 -0
  14. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/flashinfer_backend.py +93 -48
  15. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/triton_backend.py +9 -7
  16. sglang-0.3.6/sglang/srt/layers/custom_op_util.py +26 -0
  17. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/fused_moe/fused_moe.py +11 -4
  18. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/fused_moe/patch.py +4 -2
  19. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/layernorm.py +4 -0
  20. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/logits_processor.py +10 -10
  21. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/sampler.py +4 -8
  22. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/torchao_utils.py +2 -0
  23. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/data_parallel_controller.py +74 -9
  24. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/detokenizer_manager.py +1 -14
  25. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/io_struct.py +27 -0
  26. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/schedule_batch.py +104 -38
  27. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/schedule_policy.py +5 -1
  28. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/scheduler.py +210 -56
  29. sglang-0.3.6/sglang/srt/managers/session_controller.py +62 -0
  30. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/tokenizer_manager.py +38 -0
  31. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/tp_worker.py +12 -1
  32. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/tp_worker_overlap_thread.py +49 -52
  33. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/model_executor/cuda_graph_runner.py +43 -6
  34. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/model_executor/forward_batch_info.py +109 -15
  35. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/model_executor/model_runner.py +102 -43
  36. sglang-0.3.6/sglang/srt/model_parallel.py +98 -0
  37. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/deepseek_v2.py +147 -44
  38. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/gemma2.py +9 -8
  39. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/llava.py +1 -1
  40. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/llavavid.py +1 -1
  41. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/olmo.py +3 -3
  42. sglang-0.3.6/sglang/srt/models/phi3_small.py +447 -0
  43. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/qwen2_vl.py +13 -6
  44. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/torch_native_llama.py +94 -78
  45. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/openai_api/adapter.py +11 -4
  46. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/openai_api/protocol.py +30 -27
  47. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  48. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  49. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  50. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  51. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  52. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/sampling_batch_info.py +58 -57
  53. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/server.py +29 -2
  54. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/server_args.py +97 -60
  55. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/utils.py +103 -51
  56. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/runners.py +25 -6
  57. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  58. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/test_utils.py +33 -22
  59. sglang-0.3.6/sglang/version.py +1 -0
  60. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang.egg-info/PKG-INFO +5 -5
  61. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang.egg-info/SOURCES.txt +7 -1
  62. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang.egg-info/requires.txt +3 -3
  63. sglang-0.3.5.post1/sglang/version.py +0 -1
  64. {sglang-0.3.5.post1 → sglang-0.3.6}/LICENSE +0 -0
  65. {sglang-0.3.5.post1 → sglang-0.3.6}/setup.cfg +0 -0
  66. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/__init__.py +0 -0
  67. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/api.py +0 -0
  68. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/global_config.py +0 -0
  69. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/__init__.py +0 -0
  70. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/backend/__init__.py +0 -0
  71. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/backend/anthropic.py +0 -0
  72. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/backend/base_backend.py +0 -0
  73. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/backend/litellm.py +0 -0
  74. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/backend/openai.py +0 -0
  75. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/backend/runtime_endpoint.py +0 -0
  76. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/backend/vertexai.py +0 -0
  77. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/chat_template.py +0 -0
  78. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/choices.py +0 -0
  79. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/compiler.py +0 -0
  80. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/interpreter.py +0 -0
  81. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/ir.py +0 -0
  82. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/lang/tracer.py +0 -0
  83. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/launch_server.py +0 -0
  84. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/launch_server_llavavid.py +0 -0
  85. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/configs/__init__.py +0 -0
  86. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/configs/exaone.py +0 -0
  87. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/configs/model_config.py +0 -0
  88. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/configs/qwen2vl.py +0 -0
  89. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/constrained/__init__.py +0 -0
  90. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  91. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/conversation.py +0 -0
  92. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/hf_transformers_utils.py +0 -0
  93. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/__init__.py +0 -0
  94. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  95. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  96. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  97. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  98. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  99. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  100. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/fused_moe/layer.py +0 -0
  101. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/linear.py +0 -0
  102. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/pooler.py +0 -0
  103. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/quantization/__init__.py +0 -0
  104. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/quantization/base_config.py +0 -0
  105. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/radix_attention.py +0 -0
  106. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/rotary_embedding.py +0 -0
  107. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  108. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/lora/lora.py +0 -0
  109. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/lora/lora_config.py +0 -0
  110. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/lora/lora_manager.py +0 -0
  111. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/managers/image_processor.py +0 -0
  112. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  113. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  114. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/mem_cache/flush_cache.py +0 -0
  115. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/mem_cache/memory_pool.py +0 -0
  116. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/mem_cache/radix_cache.py +0 -0
  117. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/metrics/collector.py +0 -0
  118. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/metrics/func_timer.py +0 -0
  119. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/mm_utils.py +0 -0
  120. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/baichuan.py +0 -0
  121. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/chatglm.py +0 -0
  122. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/commandr.py +0 -0
  123. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/dbrx.py +0 -0
  124. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/deepseek.py +0 -0
  125. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/exaone.py +0 -0
  126. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/gemma.py +0 -0
  127. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/gemma2_reward.py +0 -0
  128. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/gpt2.py +0 -0
  129. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/gpt_bigcode.py +0 -0
  130. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/grok.py +0 -0
  131. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/internlm2.py +0 -0
  132. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/internlm2_reward.py +0 -0
  133. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/llama.py +0 -0
  134. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/llama_classification.py +0 -0
  135. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/llama_embedding.py +0 -0
  136. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/llama_reward.py +0 -0
  137. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/minicpm.py +0 -0
  138. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/minicpm3.py +0 -0
  139. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/mistral.py +0 -0
  140. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/mixtral.py +0 -0
  141. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/mixtral_quant.py +0 -0
  142. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/mllama.py +0 -0
  143. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/olmoe.py +0 -0
  144. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/qwen.py +0 -0
  145. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/qwen2.py +0 -0
  146. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/qwen2_moe.py +0 -0
  147. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/stablelm.py +0 -0
  148. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/xverse.py +0 -0
  149. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/xverse_moe.py +0 -0
  150. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/models/yivl.py +0 -0
  151. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  152. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/srt/sampling/sampling_params.py +3 -3
  153. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/few_shot_gsm8k.py +0 -0
  154. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  155. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/run_eval.py +0 -0
  156. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/simple_eval_common.py +0 -0
  157. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/simple_eval_gpqa.py +0 -0
  158. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/simple_eval_humaneval.py +0 -0
  159. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/simple_eval_math.py +0 -0
  160. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/simple_eval_mgsm.py +0 -0
  161. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/simple_eval_mmlu.py +0 -0
  162. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/test_activation.py +0 -0
  163. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/test_layernorm.py +0 -0
  164. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/test/test_programs.py +0 -0
  165. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang/utils.py +0 -0
  166. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang.egg-info/dependency_links.txt +0 -0
  167. {sglang-0.3.5.post1 → sglang-0.3.6}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.5.post1 → sglang-0.3.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5.post1
+Version: 0.3.6
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -223,22 +223,22 @@ Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.6.3.post1; extra == "srt"
+Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -323,7 +323,7 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
{sglang-0.3.5.post1 → sglang-0.3.6}/README.md

@@ -37,7 +37,7 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
{sglang-0.3.5.post1 → sglang-0.3.6}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5.post1"
+version = "0.3.6"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -16,11 +16,14 @@ classifiers = [
 dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
-runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
-    "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
-    "outlines>=0.0.44", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+runtime_common = ["aiohttp", "decord", "fastapi",
+    "hf_transfer", "huggingface_hub", "interegular",
+    "orjson", "outlines>=0.0.44,<0.1.0",
+    "packaging", "pillow", "prometheus-client>=0.20.0",
+    "psutil", "pydantic", "python-multipart",
+    "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
+    "modelscope"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
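
Net effect of the dependency changes above: outlines is now capped below 0.1.0 (presumably to stay ahead of the API rework in the outlines 0.1 series), and the exact vllm pin is relaxed to a lower bound. Assuming a standard pip setup, installing the new release with the serving extra looks like this (the quotes keep the shell from expanding the brackets):

    pip install "sglang[srt]==0.3.6"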
sglang-0.3.6/sglang/bench_latency.py

@@ -0,0 +1 @@
+raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")
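
Because this stub raises at module import time, anything that still runs the old entry point fails fast with a pointer to the new name instead of silently running stale code:

    python -m sglang.bench_latency
    ValueError: bench_latency.py has been renamed to bench_one_batch.py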
sglang-0.3.6/sglang/bench_offline_throughput.py

@@ -0,0 +1,337 @@
+"""
+Benchmark the throughput in the offline mode.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
+
+# Usage
+## Sharegpt dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
+
+## Random dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
+"""
+
+import argparse
+import dataclasses
+import json
+import logging
+import random
+import time
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+
+from sglang.api import Engine
+from sglang.bench_serving import (
+    get_dataset,
+    get_tokenizer,
+    sample_random_requests,
+    set_ulimit,
+)
+from sglang.srt.server import Runtime
+from sglang.srt.server_args import ServerArgs
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    backend: str = "engine"
+    result_filename: str = ""
+    dataset_name: str = "sharegpt"
+    dataset_path: str = ""
+    num_prompts: int = 1000
+    sharegpt_output_len: Optional[int] = None
+    random_input_len: int = 1024
+    random_output_len: int = 1024
+    random_range_ratio: float = 0.0
+    gen_num_groups: int = 64
+    gen_prompts_per_group: int = 16
+    gen_system_prompt_len: int = 2048
+    gen_question_len: int = 128
+    gen_output_len: int = 256
+    disable_ignore_eos: bool = False
+    extra_request_body: Optional[str] = None
+    seed: int = 1
+    skip_warmup: bool = False
+    do_not_exit: bool = False
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--backend", type=str, default=BenchArgs.backend)
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+        parser.add_argument(
+            "--dataset-name",
+            type=str,
+            default="sharegpt",
+            choices=["sharegpt", "random", "generated-shared-prefix"],
+            help="Name of the dataset to benchmark on.",
+        )
+        parser.add_argument(
+            "--dataset-path", type=str, default="", help="Path to the dataset."
+        )
+        parser.add_argument(
+            "--num-prompts",
+            type=int,
+            default=BenchArgs.num_prompts,
+            help="Number of prompts to process. Default is 1000.",
+        )
+        parser.add_argument(
+            "--sharegpt-output-len",
+            type=int,
+            default=BenchArgs.sharegpt_output_len,
+            help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+        )
+        parser.add_argument(
+            "--random-input-len",
+            type=int,
+            default=BenchArgs.random_input_len,
+            help="Number of input tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-output-len",
+            type=int,
+            default=BenchArgs.random_output_len,
+            help="Number of output tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-range-ratio",
+            type=float,
+            default=BenchArgs.random_range_ratio,
+            help="Range of sampled ratio of input/output length, "
+            "used only for random dataset.",
+        )
+        parser.add_argument(
+            "--gen-num-groups",
+            type=int,
+            default=BenchArgs.gen_num_groups,
+            help="Number of groups with shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-prompts-per-group",
+            type=int,
+            default=BenchArgs.gen_prompts_per_group,
+            help="Number of prompts per group of shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-system-prompt-len",
+            type=int,
+            default=BenchArgs.gen_system_prompt_len,
+            help="System prompt length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-question-len",
+            type=int,
+            default=BenchArgs.gen_question_len,
+            help="Question length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-output-len",
+            type=int,
+            default=BenchArgs.gen_output_len,
+            help="Target length in tokens for outputs in generated-shared-prefix dataset",
+        )
+        parser.add_argument(
+            "--disable-ignore-eos",
+            type=bool,
+            default=BenchArgs.disable_ignore_eos,
+            help="Disable ignore EOS token",
+        )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--skip-warmup",
+            action="store_true",
+            help="Skip the warmup batches.",
+        )
+        parser.add_argument(
+            "--do-not-exit",
+            action="store_true",
+            help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def throughput_test_once(
+    backend_name: str,
+    backend,
+    reqs: List[Tuple[str, int, int]],
+    ignore_eos: bool,
+    extra_request_body: Dict,
+):
+    measurement_results = {
+        "backend": backend_name,
+        "successful_requests": len(reqs),
+        "total_latency": -1,
+        "total_input_tokens": sum(r[1] for r in reqs),
+        "total_output_tokens": -1,
+        "request_throughput": -1,
+        "input_throughput": -1,
+        "output_throughput": -1,
+        "total_throughput": -1,
+    }
+
+    prompt = [r[0] for r in reqs]
+    sampling_params = [
+        {
+            "temperature": 0,
+            "max_new_tokens": r[2],
+            "ignore_eos": ignore_eos,
+            **extra_request_body,
+        }
+        for r in reqs
+    ]
+
+    st = time.perf_counter()
+    gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st
+
+    if backend_name == "runtime":
+        gen_out = json.loads(gen_out)
+
+    measurement_results["total_latency"] = latency
+    measurement_results["total_output_tokens"] = sum(
+        o["meta_info"]["completion_tokens"] for o in gen_out
+    )
+    measurement_results["request_throughput"] = (
+        measurement_results["successful_requests"] / latency
+    )
+    measurement_results["input_throughput"] = (
+        measurement_results["total_input_tokens"] / latency
+    )
+    measurement_results["output_throughput"] = (
+        measurement_results["total_output_tokens"] / latency
+    )
+    measurement_results["total_throughput"] = (
+        measurement_results["total_input_tokens"]
+        + measurement_results["total_output_tokens"]
+    ) / latency
+
+    return measurement_results
+
+
+def throughput_test(
+    server_args: ServerArgs,
+    bench_args: BenchArgs,
+):
+    if bench_args.backend == "engine":
+        backend = Engine(**dataclasses.asdict(server_args))
+        if not backend:
+            raise ValueError("Please provide valid engine arguments")
+    elif bench_args.backend == "runtime":
+        backend = Runtime(**dataclasses.asdict(server_args))
+    else:
+        raise ValueError('Please set backend to either "engine" or "runtime"')
+
+    tokenizer_id = server_args.model_path
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    # Set global environmnets
+    set_ulimit()
+    random.seed(bench_args.seed)
+    np.random.seed(bench_args.seed)
+
+    # Parse args
+    extra_request_body = {}
+    if bench_args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
+    # Read dataset
+    input_requests = get_dataset(bench_args, tokenizer)
+
+    warmup_requests = sample_random_requests(
+        input_len=256,
+        output_len=16,
+        num_prompts=16,
+        range_ratio=0.8,
+        tokenizer=tokenizer,
+        dataset_path=bench_args.dataset_path,
+    )
+
+    # Warm up
+    if not bench_args.skip_warmup:
+        logging.info("\nWarmup...")
+        throughput_test_once(
+            backend_name=bench_args.backend,
+            backend=backend,
+            reqs=warmup_requests,
+            ignore_eos=not bench_args.disable_ignore_eos,
+            extra_request_body=extra_request_body,
+        )
+
+    logging.info("\nBenchmark...")
+    result = throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=input_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+        extra_request_body=extra_request_body,
+    )
+
+    if bench_args.result_filename:
+        with open(bench_args.result_filename, "a") as fout:
+            fout.write(json.dumps(result) + "\n")
+
+    print(
+        "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
+    )
+    print("{:<40} {:<10}".format("Backend:", result["backend"]))
+    print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
+    print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
+    print(
+        "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", result["request_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", result["input_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", result["output_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", result["total_throughput"]
+        )
+    )
+    print("=" * 50)
+
+    return result
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format="%(message)s",
+    )
+
+    throughput_test(server_args, bench_args)
+
+    while bench_args.do_not_exit:
+        pass
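
For reference, the new benchmark is also callable from Python rather than the CLI. A minimal sketch against the functions introduced above; the model id is illustrative, and every field not set falls back to the dataclass defaults shown in the diff:

    from sglang.bench_offline_throughput import BenchArgs, throughput_test
    from sglang.srt.server_args import ServerArgs

    # Illustrative model id; any local path or Hugging Face repo id should work here.
    server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
    bench_args = BenchArgs(num_prompts=10)

    result = throughput_test(server_args, bench_args)
    print(result["output_throughput"])  # generated tokens per second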
sglang-0.3.5.post1/sglang/bench_latency.py → sglang-0.3.6/sglang/bench_one_batch.py

@@ -1,20 +1,17 @@
 """
-Benchmark the latency of running a single static batch.
+Benchmark the latency of running a single static batch without a server.
+
 This script does not launch a server and uses the low-level APIs.
-It accepts arguments similar to those of launch_server.py.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 # Usage (latency test)
 ## with dummy weights:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
-## do some changes, and store the results under a different run_name:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
-## plot the results in series of lines:
-python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 
 # Usage (correctness test):
-python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
+python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
 input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
@@ -50,13 +47,10 @@ import itertools
 import json
 import logging
 import multiprocessing
-import os
-import sqlite3
 import time
 from typing import Tuple
 
 import numpy as np
-import pandas as pd
 import torch
 import torch.distributed as dist
 
@@ -77,19 +71,14 @@ from sglang.srt.utils import (
 @dataclasses.dataclass
 class BenchArgs:
-    run_name: str = "before"
+    run_name: str = "default"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
-    result_filename: str = ""
+    result_filename: str = "result.jsonl"
     correctness_test: bool = False
     # This is only used for correctness test
    cut_len: int = 4
-    # Plotting args
-    graph_sql: str = (
-        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
-    )
-    graph_filename: str = "out.png"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -108,11 +97,6 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
-        # graphing
-        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
-        parser.add_argument(
-            "--graph-filename", type=str, default=BenchArgs.graph_filename
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -220,7 +204,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs
 
 
-@torch.inference_mode()
+@torch.no_grad
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -237,7 +221,7 @@ def extend(reqs, model_runner):
     return next_token_ids, logits_output.next_token_logits, batch
 
 
-@torch.inference_mode()
+@torch.no_grad
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
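
An aside on the decorator swap in the two hunks above: torch.inference_mode() not only disables gradient tracking but also marks the tensors it produces as inference tensors, which cannot be mutated in place (or recorded by autograd) outside the mode, whereas torch.no_grad leaves them as ordinary tensors. The relaxation here plausibly lets downstream code reuse and mutate the tensors these functions return. A minimal sketch of the difference, independent of sglang (requires a recent PyTorch for the bare @torch.no_grad form):

    import torch

    @torch.inference_mode()
    def make_inference_tensor():
        return torch.zeros(4)

    @torch.no_grad
    def make_plain_tensor():
        return torch.zeros(4)

    t = make_plain_tensor()
    t += 1  # fine: an ordinary tensor with requires_grad=False

    t = make_inference_tensor()
    # t += 1  # RuntimeError: inplace update to inference tensor outside InferenceMode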
@@ -254,6 +238,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -274,7 +259,7 @@
         bench_args, input_ids, reqs, model_runner
     )
 
-    # Extend
+    # Extend (prefill w/ KV cache)
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
     rank_print(f"prefill logits (final): {next_token_logits} \n")
 
@@ -286,7 +271,7 @@
     for i in range(len(reqs)):
         output_ids[i].append(next_token_ids_list[i])
 
-    # Print
+    # Print output texts
     for i in range(len(reqs)):
         rank_print(f"========== Prompt {i} ==========")
         rank_print(tokenizer.decode(output_ids[i]), "\n")
@@ -352,7 +337,7 @@
         f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
 
-    # record decode timing from 2nd output
+    # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
         med_decode_throughput = batch_size / med_decode_latency
@@ -367,7 +352,7 @@
         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
     )
     measurement_results["total_latency"] = tot_latency
-    measurement_results["total_throughput"] = throughput
+    measurement_results["overall_throughput"] = throughput
     return measurement_results
 
 
@@ -377,6 +362,7 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -423,71 +409,9 @@
     # Write results in jsonlines format on rank 0.
     if tp_rank == 0 and bench_args.result_filename:
-        import jsonlines
-
-        with jsonlines.open(bench_args.result_filename, "a") as f:
-            f.write_all(result_list)
-
-
-def plot_latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
-):
-    assert tp_rank == 0
-
-    # read the jsonl file and put in sqlite
-    df = pd.read_json(bench_args.result_filename, lines=True)
-    conn = sqlite3.connect(":memory:")
-    cur = conn.cursor()
-
-    # get the columns and their types
-    column_names = list(df.iloc[0].keys())
-    type_dict = {
-        str: "TEXT",
-        np.int64: "INTEGER",
-        np.float64: "FLOAT",
-    }
-    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
-
-    # create the table
-    cur.execute(
-        f"""
-        CREATE TABLE IF NOT EXISTS results (
-            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
-        )
-        """
-    )
-    conn.commit()
-
-    # write the results to DB
-    df.to_sql("results", conn, if_exists="replace", index=False)
-    conn.commit()
-
-    # read it back using sql
-    df = pd.read_sql_query(bench_args.graph_sql, conn)
-    conn.close()
-
-    # plot it and save to a file
-    import matplotlib.pyplot as plt
-
-    assert (
-        len(df.columns) == 3
-    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
-    for label in df[df.columns[0]].unique():
-        q = f"{df.columns[0]}=='{label}'"
-        series = df.query(q)
-        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
-    plt.xlabel(df.columns[1])
-    plt.ylabel(df.columns[2])
-    plt.legend()
-    plt.savefig(bench_args.graph_filename, dpi=300)
-
-    # if in kitty, just dump it to the terminal
-    if os.environ["TERM"] == "xterm-kitty":
-        os.system(
-            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
-        )
+        with open(bench_args.result_filename, "a") as fout:
+            for result in result_list:
+                fout.write(json.dumps(result) + "\n")
 
 
 def main(server_args, bench_args):
@@ -498,9 +422,6 @@ def main(server_args, bench_args):
             work_func = correctness_test
         else:
             work_func = latency_test
-    elif os.path.isfile(bench_args.result_filename):
-        assert bench_args.graph_filename, "please provide a filename for the graph"
-        work_func = plot_latency_test
     else:
         raise ValueError(
             "Provide --model-path for running the tests or "
sglang-0.3.5.post1/sglang/bench_server_latency.py → sglang-0.3.6/sglang/bench_one_batch_server.py

@@ -1,10 +1,10 @@
 """
-Benchmark the latency of serving a single batch with a real server.
+Benchmark the latency of running a single batch with a server.
+
 This script launches a server and uses the HTTP interface.
-It accepts arguments similar to those of launch_server.py.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
 python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
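
One caveat worth noting: the usage lines above are unchanged context, so they still name the old module even though the file itself moves to bench_one_batch_server.py, and no compatibility stub for bench_server_latency appears in the file list. Under the new layout the equivalent invocation is presumably:

    python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8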