sglang 0.3.3.post1.tar.gz → 0.3.4.post1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. {sglang-0.3.3.post1/sglang.egg-info → sglang-0.3.4.post1}/PKG-INFO +75 -32
  2. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/README.md +43 -13
  3. sglang-0.3.4.post1/pyproject.toml +68 -0
  4. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_latency.py +30 -11
  5. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_server_latency.py +21 -10
  6. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_serving.py +101 -7
  7. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/global_config.py +0 -1
  8. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/chat_template.py +17 -0
  9. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/launch_server_llavavid.py +1 -1
  10. sglang-0.3.4.post1/sglang/srt/configs/__init__.py +8 -0
  11. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/configs/model_config.py +2 -0
  12. sglang-0.3.4.post1/sglang/srt/configs/qwen2vl.py +133 -0
  13. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/conversation.py +27 -0
  14. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/hf_transformers_utils.py +2 -1
  15. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/__init__.py +38 -5
  16. sglang-0.3.4.post1/sglang/srt/layers/attention/double_sparsity_backend.py +297 -0
  17. sglang-0.3.4.post1/sglang/srt/layers/attention/flashinfer_backend.py +666 -0
  18. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_backend.py +26 -8
  19. sglang-0.3.4.post1/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
  20. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
  21. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +30 -6
  22. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/linear.py +89 -63
  23. sglang-0.3.4.post1/sglang/srt/layers/rotary_embedding.py +145 -0
  24. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/sampler.py +6 -2
  25. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/lora/lora.py +3 -1
  26. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/detokenizer_manager.py +31 -10
  27. sglang-0.3.4.post1/sglang/srt/managers/image_processor.py +360 -0
  28. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/io_struct.py +4 -0
  29. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/schedule_batch.py +319 -82
  30. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/schedule_policy.py +2 -1
  31. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/scheduler.py +233 -158
  32. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/tokenizer_manager.py +15 -5
  33. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/tp_worker.py +30 -5
  34. sglang-0.3.4.post1/sglang/srt/managers/tp_worker_overlap_thread.py +172 -0
  35. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/chunk_cache.py +8 -4
  36. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/memory_pool.py +123 -11
  37. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/radix_cache.py +19 -10
  38. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/model_executor/cuda_graph_runner.py +63 -12
  39. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/model_executor/forward_batch_info.py +101 -23
  40. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/model_executor/model_runner.py +92 -12
  41. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/baichuan.py +2 -3
  42. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/chatglm.py +8 -9
  43. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/commandr.py +1 -2
  44. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/dbrx.py +1 -2
  45. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/deepseek.py +4 -5
  46. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/deepseek_v2.py +7 -8
  47. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/exaone.py +1 -2
  48. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/gemma.py +2 -2
  49. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/gemma2.py +5 -5
  50. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/gpt_bigcode.py +5 -5
  51. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/grok.py +1 -2
  52. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/internlm2.py +1 -2
  53. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama.py +1 -2
  54. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama_classification.py +1 -2
  55. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama_reward.py +2 -3
  56. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llava.py +4 -8
  57. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llavavid.py +1 -2
  58. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/minicpm.py +1 -2
  59. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/minicpm3.py +5 -6
  60. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/mixtral.py +1 -2
  61. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/mixtral_quant.py +1 -2
  62. sglang-0.3.4.post1/sglang/srt/models/mllama.py +1004 -0
  63. sglang-0.3.4.post1/sglang/srt/models/olmo.py +352 -0
  64. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/olmoe.py +1 -2
  65. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/qwen.py +1 -2
  66. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/qwen2.py +1 -2
  67. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/qwen2_moe.py +4 -5
  68. sglang-0.3.4.post1/sglang/srt/models/qwen2_vl.py +724 -0
  69. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/stablelm.py +1 -2
  70. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/torch_native_llama.py +1 -2
  71. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/xverse.py +1 -2
  72. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/xverse_moe.py +4 -5
  73. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/yivl.py +1 -2
  74. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/openai_api/adapter.py +92 -49
  75. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/openai_api/protocol.py +10 -2
  76. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
  77. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/sampling_batch_info.py +103 -59
  78. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/sampling_params.py +2 -0
  79. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/server.py +116 -17
  80. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/server_args.py +131 -45
  81. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/utils.py +33 -3
  82. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/few_shot_gsm8k.py +4 -1
  83. sglang-0.3.4.post1/sglang/test/few_shot_gsm8k_engine.py +144 -0
  84. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/runners.py +20 -1
  85. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/srt/sampling/penaltylib/utils.py +16 -12
  86. sglang-0.3.4.post1/sglang/version.py +1 -0
  87. {sglang-0.3.3.post1 → sglang-0.3.4.post1/sglang.egg-info}/PKG-INFO +75 -32
  88. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/SOURCES.txt +9 -1
  89. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/requires.txt +20 -3
  90. sglang-0.3.3.post1/pyproject.toml +0 -42
  91. sglang-0.3.3.post1/sglang/srt/configs/__init__.py +0 -5
  92. sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_backend.py +0 -277
  93. sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_utils.py +0 -237
  94. sglang-0.3.3.post1/sglang/srt/managers/image_processor.py +0 -187
  95. sglang-0.3.3.post1/sglang/version.py +0 -1
  96. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/LICENSE +0 -0
  97. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/setup.cfg +0 -0
  98. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/__init__.py +0 -0
  99. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/api.py +0 -0
  100. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/check_env.py +0 -0
  101. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/__init__.py +0 -0
  102. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/__init__.py +0 -0
  103. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/anthropic.py +0 -0
  104. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/base_backend.py +0 -0
  105. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/litellm.py +0 -0
  106. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/openai.py +0 -0
  107. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  108. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/vertexai.py +0 -0
  109. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/choices.py +0 -0
  110. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/compiler.py +0 -0
  111. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/interpreter.py +0 -0
  112. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/ir.py +0 -0
  113. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/tracer.py +0 -0
  114. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/launch_server.py +0 -0
  115. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/configs/exaone.py +0 -0
  116. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/__init__.py +0 -0
  117. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
  118. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/fsm_cache.py +0 -0
  119. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/jump_forward.py +0 -0
  120. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/activation.py +0 -0
  121. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  122. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  123. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  124. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
  125. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
  126. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/layernorm.py +0 -0
  127. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/logits_processor.py +0 -0
  128. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/pooler.py +0 -0
  129. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  130. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  131. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/radix_attention.py +0 -0
  132. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  133. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/lora/lora_config.py +0 -0
  134. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/lora/lora_manager.py +0 -0
  135. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  136. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  137. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  138. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mm_utils.py +0 -0
  139. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama_embedding.py +0 -0
  140. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/mistral.py +0 -0
  141. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  142. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  143. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  144. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  145. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  146. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/run_eval.py +0 -0
  147. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_common.py +0 -0
  148. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  149. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  150. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_math.py +0 -0
  151. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  152. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  153. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_activation.py +0 -0
  154. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_layernorm.py +0 -0
  155. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_programs.py +0 -0
  156. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_utils.py +0 -0
  157. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/utils.py +0 -0
  158. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/dependency_links.txt +0 -0
  159. {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.3.post1/sglang.egg-info → sglang-0.3.4.post1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.3.post1
+Version: 0.3.4.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -214,26 +214,31 @@ License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
+Provides-Extra: runtime-common
+Requires-Dist: aiohttp; extra == "runtime-common"
+Requires-Dist: decord; extra == "runtime-common"
+Requires-Dist: fastapi; extra == "runtime-common"
+Requires-Dist: hf_transfer; extra == "runtime-common"
+Requires-Dist: huggingface_hub; extra == "runtime-common"
+Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: pillow; extra == "runtime-common"
+Requires-Dist: psutil; extra == "runtime-common"
+Requires-Dist: pydantic; extra == "runtime-common"
+Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: torchao; extra == "runtime-common"
+Requires-Dist: uvicorn; extra == "runtime-common"
+Requires-Dist: uvloop; extra == "runtime-common"
+Requires-Dist: zmq; extra == "runtime-common"
+Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
-Requires-Dist: aiohttp; extra == "srt"
-Requires-Dist: decord; extra == "srt"
-Requires-Dist: fastapi; extra == "srt"
-Requires-Dist: hf_transfer; extra == "srt"
-Requires-Dist: huggingface_hub; extra == "srt"
-Requires-Dist: interegular; extra == "srt"
-Requires-Dist: packaging; extra == "srt"
-Requires-Dist: pillow; extra == "srt"
-Requires-Dist: psutil; extra == "srt"
-Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: python-multipart; extra == "srt"
+Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: torchao; extra == "srt"
-Requires-Dist: uvicorn; extra == "srt"
-Requires-Dist: uvloop; extra == "srt"
-Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.5; extra == "srt"
-Requires-Dist: outlines>=0.0.44; extra == "srt"
-Requires-Dist: modelscope; extra == "srt"
+Requires-Dist: vllm==0.6.3.post1; extra == "srt"
+Provides-Extra: srt-xpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -253,9 +258,17 @@ Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Provides-Extra: all-xpu
+Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
+Requires-Dist: sglang[openai]; extra == "all-xpu"
+Requires-Dist: sglang[anthropic]; extra == "all-xpu"
+Requires-Dist: sglang[litellm]; extra == "all-xpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
+Provides-Extra: dev-xpu
+Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
+Requires-Dist: sglang[test]; extra == "dev-xpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -270,19 +283,18 @@ Requires-Dist: sglang[test]; extra == "dev"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
-
-## Upcoming Events
-- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
 
 ## News
-- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
-- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
+- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -323,7 +335,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -500,6 +512,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 
+### Engine Without HTTP Server
+
+We also provide an inference engine **without a HTTP server**. For example,
+
+```python
+import sglang as sgl
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+This can be used for:
+
+1. **Offline Batch Inference**
+2. **Building Custom Servers**
+
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+
 ### Supported Models
 
 **Generative Models**
@@ -529,6 +575,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4
 
 **Embedding Models**
 
@@ -836,10 +883,7 @@ def chat_example(s):
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Benchmark And Performance
-![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
-![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
-
-Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
+Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -849,7 +893,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 
 
-
 <p align="center">
 <a href="#sglangtop" target="_blank">
 <bold>Back To Top </bold>
{sglang-0.3.3.post1 → sglang-0.3.4.post1}/README.md

@@ -11,19 +11,18 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
-
-## Upcoming Events
-- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
 
 ## News
-- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
-- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
+- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -64,7 +63,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -241,6 +240,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 
+### Engine Without HTTP Server
+
+We also provide an inference engine **without a HTTP server**. For example,
+
+```python
+import sglang as sgl
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+This can be used for:
+
+1. **Offline Batch Inference**
+2. **Building Custom Servers**
+
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+
 ### Supported Models
 
 **Generative Models**
@@ -270,6 +303,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4
 
 **Embedding Models**
 
@@ -577,10 +611,7 @@ def chat_example(s):
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Benchmark And Performance
-![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
-![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
-
-Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
+Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -590,7 +621,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 
 
-
 <p align="center">
 <a href="#sglangtop" target="_blank">
 <bold>Back To Top </bold>
sglang-0.3.4.post1/pyproject.toml (new file)

@@ -0,0 +1,68 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sglang"
+version = "0.3.4.post1"
+description = "SGLang is yet another fast serving framework for large language models and vision language models."
+readme = "README.md"
+requires-python = ">=3.8"
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+]
+dependencies = ["requests", "tqdm", "numpy"]
+
+[project.optional-dependencies]
+runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+    "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
+    "torchao", "uvicorn", "uvloop", "zmq",
+    "outlines>=0.0.44", "modelscope"]
+# xpu is not enabled in public vllm and torch whl,
+# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
+srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+srt_xpu = ["sglang[runtime_common]"]
+
+openai = ["openai>=1.0", "tiktoken"]
+anthropic = ["anthropic>=0.20.0"]
+litellm = ["litellm>=1.0.0"]
+test = [
+    "jsonlines",
+    "matplotlib",
+    "pandas",
+    "sentence_transformers",
+    "accelerate",
+    "peft",
+]
+all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+dev = ["sglang[all]", "sglang[test]"]
+dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+
+[project.urls]
+"Homepage" = "https://github.com/sgl-project/sglang"
+"Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
+
+[tool.setuptools.packages.find]
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]
+
+[tool.wheel]
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]
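
The layered extras above are the main packaging change in this release: `srt` now composes the shared `runtime_common` set with `torch` and the `vllm==0.6.3.post1` pin, while `srt_xpu` reuses the same common set and leaves torch/vllm to the vLLM XPU installation guide. As an illustrative sketch (these commands are not in the diff itself; the extras names are taken from the pyproject above):

```
# CUDA backend: runtime_common + torch + vllm==0.6.3.post1, plus the client extras
pip install "sglang[all]"

# Intel XPU backend: runtime_common only; torch/vllm must be installed
# separately following the vLLM XPU installation guide
pip install "sglang[all_xpu]"

# Contributor setup: everything above plus the test extras
pip install "sglang[dev]"
```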
{sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_latency.py

@@ -227,22 +227,24 @@ def extend(reqs, model_runner):
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
+        model_config=model_runner.model_config,
     )
-    batch.prepare_for_extend(model_runner.model_config.vocab_size)
+    batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits, batch
 
 
 @torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
-    batch.prepare_for_decode(input_token_ids)
+    batch.output_ids = input_token_ids
+    batch.prepare_for_decode()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits
 
 
@@ -252,6 +254,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
     # Load the model
@@ -279,8 +282,9 @@ def correctness_test(
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
     for _ in range(bench_args.output_len[0] - 1):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        next_token_ids_list = next_token_ids.tolist()
         for i in range(len(reqs)):
-            output_ids[i].append(next_token_ids[i])
+            output_ids[i].append(next_token_ids_list[i])
 
     # Print
     for i in range(len(reqs)):
@@ -288,8 +292,15 @@
         rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
+def synchronize(device):
+    if device == "cuda":
+        torch.cuda.synchronize()
+    elif device == "xpu":
+        torch.xpu.synchronize()
+
+
 def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
+    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -312,10 +323,10 @@
     tot_latency = 0
 
     # Prefill
-    torch.cuda.synchronize()
+    synchronize(device)
     tic = time.time()
     next_token_ids, _, batch = extend(reqs, model_runner)
-    torch.cuda.synchronize()
+    synchronize(device)
     prefill_latency = time.time() - tic
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
@@ -328,10 +339,10 @@
     # Decode
     decode_latencies = []
     for i in range(output_len - 1):
-        torch.cuda.synchronize()
+        synchronize(device)
         tic = time.time()
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-        torch.cuda.synchronize()
+        synchronize(device)
         latency = time.time() - tic
         tot_latency += latency
         throughput = batch_size / latency
@@ -387,6 +398,7 @@
         bench_args.batch_size[0],
         bench_args.input_len[0],
         8,  # shorter decoding to speed up the warmup
+        server_args.device,
     )
     rank_print("Benchmark ...")
 
@@ -397,7 +409,14 @@
     ):
         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
-            bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
+            bench_args.run_name,
+            model_runner,
+            rank_print,
+            reqs,
+            bs,
+            il,
+            ol,
+            server_args.device,
         )
         if ret is not None:
            result_list.append(ret)
{sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_server_latency.py

@@ -6,6 +6,8 @@ It accepts arguments similar to those of launch_server.py.
 Usage:
 
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+
+python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
@@ -32,6 +34,8 @@ class BenchArgs:
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
     result_filename: str = "result.jsonl"
+    base_url: str = ""
+    skip_warmup: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -48,6 +52,8 @@ class BenchArgs:
         parser.add_argument(
             "--result-filename", type=str, default=BenchArgs.result_filename
         )
+        parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
+        parser.add_argument("--skip-warmup", action="store_true")
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -139,17 +145,21 @@
 
 
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
-    proc, base_url = launch_server_process(server_args)
+    if bench_args.base_url:
+        proc, base_url = None, bench_args.base_url
+    else:
+        proc, base_url = launch_server_process(server_args)
 
     # warmup
-    run_one_case(
-        base_url,
-        batch_size=16,
-        input_len=1024,
-        output_len=16,
-        run_name="",
-        result_filename="",
-    )
+    if not bench_args.skip_warmup:
+        run_one_case(
+            base_url,
+            batch_size=16,
+            input_len=1024,
+            output_len=16,
+            run_name="",
+            result_filename="",
+        )
 
     # benchmark
     try:
@@ -165,7 +175,8 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             bench_args.result_filename,
         )
     finally:
-        kill_child_process(proc.pid)
+        if proc:
+            kill_child_process(proc.pid)
 
     print(f"\nResults are saved to {bench_args.result_filename}")
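
Together, the new `--base-url` and `--skip-warmup` options let bench_server_latency measure a server it did not launch itself. A hedged sketch of that workflow, assuming a locally launched server on port 30000 (the benchmark command comes from the updated usage docstring, with `--skip-warmup` added; the launch command mirrors the README examples quoted above):

```
# Terminal 1: start a server yourself
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000

# Terminal 2: benchmark the running server. --base-url skips spawning a
# child server process, --model None satisfies the otherwise-required model
# argument, and --skip-warmup omits the built-in 16x1024 warmup case.
python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --skip-warmup --batch-size 16 --input-len 1024 --output-len 8
```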