sglang 0.3.1.post1__tar.gz → 0.3.1.post3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. {sglang-0.3.1.post1/sglang.egg-info → sglang-0.3.1.post3}/PKG-INFO +5 -5
  2. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/README.md +4 -4
  3. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/pyproject.toml +1 -1
  4. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/bench_latency.py +11 -2
  5. sglang-0.3.1.post3/sglang/bench_server_latency.py +187 -0
  6. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/bench_serving.py +1 -1
  7. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/activation.py +8 -4
  8. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/attention_backend.py +3 -1
  9. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/layernorm.py +10 -7
  10. sglang-0.3.1.post3/sglang/srt/layers/linear.py +1133 -0
  11. sglang-0.3.1.post3/sglang/srt/layers/quantization/__init__.py +76 -0
  12. sglang-0.3.1.post3/sglang/srt/layers/quantization/base_config.py +122 -0
  13. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/sampler.py +9 -2
  14. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/io_struct.py +3 -0
  15. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/policy_scheduler.py +49 -93
  16. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/schedule_batch.py +1 -1
  17. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/tp_worker.py +11 -6
  18. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/model_executor/cuda_graph_runner.py +15 -14
  19. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/model_executor/model_runner.py +13 -5
  20. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/baichuan.py +1 -1
  21. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/chatglm.py +6 -6
  22. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/commandr.py +7 -7
  23. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/dbrx.py +7 -7
  24. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/deepseek.py +7 -7
  25. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/deepseek_v2.py +9 -9
  26. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/exaone.py +6 -6
  27. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/gemma.py +6 -6
  28. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/gemma2.py +6 -6
  29. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/gpt_bigcode.py +6 -6
  30. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/grok.py +6 -6
  31. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/internlm2.py +6 -6
  32. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/llama.py +7 -9
  33. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/llama_classification.py +3 -4
  34. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/llava.py +1 -1
  35. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/llavavid.py +1 -1
  36. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/minicpm.py +6 -6
  37. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/minicpm3.py +3 -3
  38. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/mixtral.py +6 -6
  39. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/mixtral_quant.py +6 -6
  40. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/olmoe.py +1 -1
  41. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/qwen.py +6 -6
  42. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/qwen2.py +6 -6
  43. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/qwen2_moe.py +7 -7
  44. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/stablelm.py +6 -6
  45. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/xverse.py +2 -4
  46. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/xverse_moe.py +2 -5
  47. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/yivl.py +1 -1
  48. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/server_args.py +17 -21
  49. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/utils.py +21 -1
  50. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/few_shot_gsm8k.py +8 -2
  51. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/test_utils.py +5 -2
  52. sglang-0.3.1.post3/sglang/version.py +1 -0
  53. {sglang-0.3.1.post1 → sglang-0.3.1.post3/sglang.egg-info}/PKG-INFO +5 -5
  54. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang.egg-info/SOURCES.txt +4 -0
  55. sglang-0.3.1.post1/sglang/version.py +0 -1
  56. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/LICENSE +0 -0
  57. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/setup.cfg +0 -0
  58. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/__init__.py +0 -0
  59. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/api.py +0 -0
  60. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/check_env.py +0 -0
  61. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/global_config.py +0 -0
  62. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/__init__.py +0 -0
  63. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/backend/__init__.py +0 -0
  64. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/backend/anthropic.py +0 -0
  65. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/backend/base_backend.py +0 -0
  66. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/backend/litellm.py +0 -0
  67. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/backend/openai.py +0 -0
  68. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/backend/runtime_endpoint.py +0 -0
  69. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/backend/vertexai.py +0 -0
  70. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/chat_template.py +0 -0
  71. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/choices.py +0 -0
  72. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/compiler.py +0 -0
  73. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/interpreter.py +0 -0
  74. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/ir.py +0 -0
  75. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/lang/tracer.py +0 -0
  76. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/launch_server.py +0 -0
  77. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/launch_server_llavavid.py +0 -0
  78. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/configs/__init__.py +0 -0
  79. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/configs/exaone.py +0 -0
  80. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/configs/model_config.py +0 -0
  81. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/constrained/__init__.py +0 -0
  82. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/constrained/base_tool_cache.py +0 -0
  83. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/constrained/fsm_cache.py +0 -0
  84. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/constrained/jump_forward.py +0 -0
  85. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/conversation.py +0 -0
  86. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/hf_transformers_utils.py +0 -0
  87. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/flashinfer_utils.py +0 -0
  88. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  89. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  90. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/layer.py +0 -0
  91. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/logits_processor.py +0 -0
  92. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/pooler.py +0 -0
  93. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/radix_attention.py +0 -0
  94. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/torchao_utils.py +0 -0
  95. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/decode_attention.py +0 -0
  96. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/extend_attention.py +0 -0
  97. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/prefill_attention.py +0 -0
  98. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/lora/lora.py +0 -0
  99. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/lora/lora_config.py +0 -0
  100. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/lora/lora_manager.py +0 -0
  101. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/controller_multi.py +0 -0
  102. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/controller_single.py +0 -0
  103. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/detokenizer_manager.py +0 -0
  104. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/managers/tokenizer_manager.py +0 -0
  105. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  106. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  107. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/mem_cache/flush_cache.py +0 -0
  108. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/mem_cache/memory_pool.py +0 -0
  109. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/mem_cache/radix_cache.py +0 -0
  110. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/mm_utils.py +0 -0
  111. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  112. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/llama_embedding.py +0 -0
  113. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/models/mistral.py +0 -0
  114. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/openai_api/adapter.py +0 -0
  115. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/openai_api/protocol.py +0 -0
  116. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  117. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  118. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  119. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  120. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  121. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  122. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  123. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/sampling/sampling_params.py +0 -0
  124. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/server.py +0 -0
  125. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/run_eval.py +0 -0
  126. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/runners.py +0 -0
  127. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/simple_eval_common.py +0 -0
  128. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/simple_eval_gpqa.py +0 -0
  129. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/simple_eval_humaneval.py +0 -0
  130. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/simple_eval_math.py +0 -0
  131. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/simple_eval_mgsm.py +0 -0
  132. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/simple_eval_mmlu.py +0 -0
  133. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  134. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/test_activation.py +0 -0
  135. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/test_layernorm.py +0 -0
  136. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/test/test_programs.py +0 -0
  137. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/utils.py +0 -0
  138. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang.egg-info/dependency_links.txt +0 -0
  139. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang.egg-info/requires.txt +0 -0
  140. {sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.1.post1/sglang.egg-info → sglang-0.3.1.post3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post1
+Version: 0.3.1.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004

@@ -269,7 +269,7 @@ Requires-Dist: sglang[test]; extra == "dev"

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |

 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

@@ -278,7 +278,7 @@ The core features include:
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).

@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- To enable DeepSeek MLA acceleration, add `--enable-mla`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```

@@ -500,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
+- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
{sglang-0.3.1.post1 → sglang-0.3.1.post3}/README.md

@@ -11,7 +11,7 @@

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |

 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

@@ -20,7 +20,7 @@ The core features include:
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).

@@ -60,7 +60,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -225,7 +225,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- To enable DeepSeek MLA acceleration, add `--enable-mla`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```

@@ -242,6 +241,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
+- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
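
The `@@ -225` hunk above documents independent launch flags. As a minimal sketch of combining two of them on one launch, driven from Python via subprocess; the model path and port are illustrative placeholders, not values taken from this diff:

import subprocess

# Launch the documented CLI with torch.compile acceleration and an fp8 KV
# cache. Placeholder model path and port; any served model and free port work.
subprocess.run(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "meta-llama/Meta-Llama-3-8B-Instruct",
        "--port", "30000",
        "--enable-torch-compile",
        "--kv-cache-dtype", "fp8_e5m2",
    ],
    check=True,
)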
{sglang-0.3.1.post1 → sglang-0.3.1.post3}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.3.1.post1"
+version = "0.3.1.post3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/bench_latency.py

@@ -1,5 +1,7 @@
 """
-Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
+Benchmark the latency of running a single static batch.
+This script does not launch a server and uses the low-level APIs.
+It accepts arguments similar to those of launch_server.py.

 # Usage (latency test)
 ## with dummy weights:

@@ -62,8 +64,13 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process, suppress_other_loggers
+from sglang.srt.utils import (
+    configure_logger,
+    kill_child_process,
+    suppress_other_loggers,
+)


 @dataclasses.dataclass

@@ -339,6 +346,8 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
+    _set_envs_and_config(server_args)
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     # Load the model
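
The two added lines run in each tensor-parallel worker: `configure_logger` tags that rank's log output with a ` TP{tp_rank}` prefix, and the pre-existing `rank_print` line below them silences every rank but 0. The latter idiom is worth seeing in isolation; a self-contained sketch:

# Rank-gated printing as used in latency_test: only TP rank 0 writes to
# stdout, so output from multiple ranks does not interleave.
def make_rank_print(tp_rank: int):
    return print if tp_rank == 0 else (lambda *args, **kwargs: None)

make_rank_print(tp_rank=0)("model loaded")  # printed
make_rank_print(tp_rank=1)("model loaded")  # silently dropped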
sglang-0.3.1.post3/sglang/bench_server_latency.py

@@ -0,0 +1,187 @@
+"""
+Benchmark the latency of serving a single batch with a real server.
+This script launches a server and uses the HTTP interface.
+It accepts arguments similar to those of launch_server.py.
+
+Usage:
+
+python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+"""
+
+import argparse
+import dataclasses
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from typing import Tuple
+
+import numpy as np
+import requests
+
+from sglang.srt.server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    result_filename: str = "result.jsonl"
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
+
+
+def launch_server_process(server_args: ServerArgs):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = 600
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if response.status_code == 200:
+                return proc, base_url
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def run_one_case(
+    url: str,
+    batch_size: int,
+    input_len: int,
+    output_len: int,
+    run_name: str,
+    result_filename: str,
+):
+    input_ids = [
+        [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+        for _ in range(batch_size)
+    ]
+
+    tic = time.time()
+    response = requests.post(
+        url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+        },
+    )
+    latency = time.time() - tic
+
+    _ = response.json()
+    output_throughput = batch_size * output_len / latency
+    overall_throughput = batch_size * (input_len + output_len) / latency
+
+    print(f"batch size: {batch_size}")
+    print(f"latency: {latency:.2f} s")
+    print(f"output throughput: {output_throughput:.2f} token/s")
+    print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+    if result_filename:
+        with open(result_filename, "a") as fout:
+            res = {
+                "run_name": run_name,
+                "batch_size": batch_size,
+                "input_len": input_len,
+                "output_len": output_len,
+                "latency": round(latency, 4),
+                "output_throughput": round(output_throughput, 2),
+                "overall_throughput": round(overall_throughput, 2),
+            }
+            fout.write(json.dumps(res) + "\n")
+
+
+def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+    proc, base_url = launch_server_process(server_args)
+
+    # warmup
+    run_one_case(
+        base_url,
+        batch_size=16,
+        input_len=1024,
+        output_len=16,
+        run_name="",
+        result_filename="",
+    )
+
+    # benchmark
+    try:
+        for bs, il, ol in itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        ):
+            run_one_case(
+                base_url,
+                bs,
+                il,
+                ol,
+                bench_args.run_name,
+                bench_args.result_filename,
+            )
+    finally:
+        kill_child_process(proc.pid)
+
+    print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
+    args = parser.parse_args()
+
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    run_benchmark(server_args, bench_args)
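
A subtlety in `BenchArgs.from_cli_args` above: `--batch-size`, `--input-len`, and `--output-len` are declared with `nargs="+"`, so argparse hands back lists, while the dataclass fields are tuples. Casting each parsed value with the type of the corresponding field's default converts list back to tuple (and leaves `str` fields alone). A standalone demonstration of the same pattern, with an illustrative `Args` class that is not part of the package:

import argparse
import dataclasses
from typing import Tuple

@dataclasses.dataclass
class Args:  # illustrative stand-in for BenchArgs
    batch_size: Tuple[int] = (1,)

parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=int, nargs="+", default=Args.batch_size)
ns = parser.parse_args(["--batch-size", "1", "16", "64"])  # argparse yields a list

# cast each parsed value with the type of the field's default (list -> tuple)
fields = [(f.name, type(f.default)) for f in dataclasses.fields(Args)]
args = Args(**{name: typ(getattr(ns, name)) for name, typ in fields})
print(args.batch_size)  # (1, 16, 64)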
{sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/bench_serving.py

@@ -2,7 +2,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py

 """
-Benchmark online serving.
+Benchmark online serving with dynamic requests.

 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
{sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/activation.py

@@ -19,17 +19,21 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
+from sglang.srt.utils import is_hip
+
+if not is_hip():
+    from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.utils import set_weight_attrs

-from sglang.srt.utils import is_hip
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.utils import set_weight_attrs

 logger = logging.getLogger(__name__)

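The guarded import above exists because flashinfer ships CUDA-only kernels: on ROCm builds (`is_hip()` true) the import is skipped and the module must fall back to non-flashinfer code paths. A minimal sketch of the pattern; the pure-PyTorch fallback, including its signature, is an assumption for illustration rather than the package's actual ROCm path:

import torch

def is_hip() -> bool:
    # HIP builds of torch expose a version string here; CUDA builds expose None
    return torch.version.hip is not None

if not is_hip():
    from flashinfer.activation import silu_and_mul  # fused CUDA kernel
else:
    def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
        # assumed unfused fallback: SiLU(gate half) * up half of a fused input
        d = x.shape[-1] // 2
        return torch.nn.functional.silu(x[..., :d]) * x[..., d:]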
{sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/attention_backend.py

@@ -346,7 +346,9 @@ class TritonAttnBackend(AttentionBackend):

         self.decode_attention_fwd = decode_attention_fwd
         self.extend_attention_fwd = extend_attention_fwd
-        self.num_head = model_runner.model_config.num_attention_heads
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // model_runner.tp_size
+        )

         if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
             self.reduce_dtype = torch.float32
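
This one-line fix matters because under tensor parallelism each rank holds only its shard of the attention heads, and the Triton backend sizes per-rank work from `self.num_head`. With illustrative numbers (not from the diff):

# 32 query heads sharded across 4 tensor-parallel GPUs
num_attention_heads = 32
tp_size = 4

num_head = num_attention_heads // tp_size
print(num_head)  # 8 heads per rank; the old code wrongly used all 32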
{sglang-0.3.1.post1 → sglang-0.3.1.post3}/sglang/srt/layers/layernorm.py

@@ -20,16 +20,19 @@ from typing import Optional, Tuple, Union

 import torch
 import torch.nn as nn
-from flashinfer.norm import (
-    fused_add_rmsnorm,
-    gemma_fused_add_rmsnorm,
-    gemma_rmsnorm,
-    rmsnorm,
-)
-from vllm.model_executor.custom_op import CustomOp

 from sglang.srt.utils import is_hip

+if not is_hip():
+    from flashinfer.norm import (
+        fused_add_rmsnorm,
+        gemma_fused_add_rmsnorm,
+        gemma_rmsnorm,
+        rmsnorm,
+    )
+
+from vllm.model_executor.custom_op import CustomOp
+

 logger = logging.getLogger(__name__)

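
As in activation.py, the flashinfer.norm kernels are now imported only off the ROCm path. For reference, an unfused PyTorch sketch of the operation `fused_add_rmsnorm` fuses (residual add followed by RMSNorm); the real kernel's exact signature and in-place update behavior are not reproduced here, so treat this as a math-level reference only:

import torch

def fused_add_rmsnorm_ref(x, residual, weight, eps: float = 1e-6):
    # the "add" half of the fusion
    residual = residual + x
    # the RMSNorm half, normalizing over the hidden dimension
    variance = residual.pow(2).mean(dim=-1, keepdim=True)
    return residual * torch.rsqrt(variance + eps) * weight, residual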