sglang 0.3.1.post1__tar.gz → 0.3.1.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. {sglang-0.3.1.post1/sglang.egg-info → sglang-0.3.1.post2}/PKG-INFO +4 -5
  2. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/README.md +3 -4
  3. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/pyproject.toml +1 -1
  4. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/bench_latency.py +3 -1
  5. sglang-0.3.1.post2/sglang/bench_server_latency.py +187 -0
  6. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/bench_serving.py +1 -1
  7. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/activation.py +6 -3
  8. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/layernorm.py +10 -7
  9. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/sampler.py +9 -2
  10. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/io_struct.py +3 -0
  11. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/policy_scheduler.py +49 -93
  12. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/schedule_batch.py +1 -1
  13. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/tp_worker.py +11 -6
  14. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/model_executor/cuda_graph_runner.py +15 -14
  15. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/model_executor/model_runner.py +13 -5
  16. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/deepseek_v2.py +2 -2
  17. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llama.py +1 -3
  18. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llama_classification.py +2 -3
  19. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/minicpm3.py +2 -2
  20. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/xverse.py +1 -3
  21. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/xverse_moe.py +1 -4
  22. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/server_args.py +17 -21
  23. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/few_shot_gsm8k.py +8 -2
  24. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_utils.py +1 -0
  25. sglang-0.3.1.post2/sglang/version.py +1 -0
  26. {sglang-0.3.1.post1 → sglang-0.3.1.post2/sglang.egg-info}/PKG-INFO +4 -5
  27. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/SOURCES.txt +1 -0
  28. sglang-0.3.1.post1/sglang/version.py +0 -1
  29. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/LICENSE +0 -0
  30. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/setup.cfg +0 -0
  31. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/__init__.py +0 -0
  32. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/api.py +0 -0
  33. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/check_env.py +0 -0
  34. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/global_config.py +0 -0
  35. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/__init__.py +0 -0
  36. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/__init__.py +0 -0
  37. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/anthropic.py +0 -0
  38. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/base_backend.py +0 -0
  39. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/litellm.py +0 -0
  40. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/openai.py +0 -0
  41. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  42. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/vertexai.py +0 -0
  43. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/chat_template.py +0 -0
  44. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/choices.py +0 -0
  45. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/compiler.py +0 -0
  46. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/interpreter.py +0 -0
  47. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/ir.py +0 -0
  48. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/tracer.py +0 -0
  49. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/launch_server.py +0 -0
  50. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/launch_server_llavavid.py +0 -0
  51. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/configs/__init__.py +0 -0
  52. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/configs/exaone.py +0 -0
  53. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/configs/model_config.py +0 -0
  54. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/__init__.py +0 -0
  55. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/base_tool_cache.py +0 -0
  56. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/fsm_cache.py +0 -0
  57. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/jump_forward.py +0 -0
  58. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/conversation.py +0 -0
  59. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/hf_transformers_utils.py +0 -0
  60. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/attention_backend.py +0 -0
  61. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/flashinfer_utils.py +0 -0
  62. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  63. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  64. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
  65. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/logits_processor.py +0 -0
  66. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/pooler.py +0 -0
  67. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/radix_attention.py +0 -0
  68. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  69. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/triton_attention/decode_attention.py +0 -0
  70. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/triton_attention/extend_attention.py +0 -0
  71. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/triton_attention/prefill_attention.py +0 -0
  72. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/lora/lora.py +0 -0
  73. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/lora/lora_config.py +0 -0
  74. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/lora/lora_manager.py +0 -0
  75. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/controller_multi.py +0 -0
  76. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/controller_single.py +0 -0
  77. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
  78. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/tokenizer_manager.py +0 -0
  79. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  80. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  81. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  82. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
  83. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  84. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mm_utils.py +0 -0
  85. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  86. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/baichuan.py +0 -0
  87. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/chatglm.py +0 -0
  88. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/commandr.py +0 -0
  89. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/dbrx.py +0 -0
  90. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/deepseek.py +0 -0
  91. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/exaone.py +0 -0
  92. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/gemma.py +0 -0
  93. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/gemma2.py +0 -0
  94. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  95. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/grok.py +0 -0
  96. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/internlm2.py +0 -0
  97. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llama_embedding.py +0 -0
  98. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llava.py +0 -0
  99. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llavavid.py +0 -0
  100. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/minicpm.py +0 -0
  101. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/mistral.py +0 -0
  102. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/mixtral.py +0 -0
  103. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/mixtral_quant.py +0 -0
  104. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/olmoe.py +0 -0
  105. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/qwen.py +0 -0
  106. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/qwen2.py +0 -0
  107. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/qwen2_moe.py +0 -0
  108. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/stablelm.py +0 -0
  109. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/yivl.py +0 -0
  110. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/openai_api/adapter.py +0 -0
  111. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/openai_api/protocol.py +0 -0
  112. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  113. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  114. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  115. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  116. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  117. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  118. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  119. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  120. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/server.py +0 -0
  121. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/utils.py +0 -0
  122. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/run_eval.py +0 -0
  123. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/runners.py +0 -0
  124. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_common.py +0 -0
  125. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  126. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  127. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_math.py +0 -0
  128. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  129. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  130. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  131. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_activation.py +0 -0
  132. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_layernorm.py +0 -0
  133. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_programs.py +0 -0
  134. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/utils.py +0 -0
  135. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/dependency_links.txt +0 -0
  136. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/requires.txt +0 -0
  137. {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.1.post1
+ Version: 0.3.1.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: sglang[test]; extra == "dev"

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |

  SGLang is a fast serving framework for large language models and vision language models.
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

  ## News
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
- - To enable DeepSeek MLA acceleration, add `--enable-mla`.
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
  - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
README.md
@@ -11,7 +11,7 @@

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |

  SGLang is a fast serving framework for large language models and vision language models.
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -20,7 +20,7 @@ The core features include:
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

  ## News
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -60,7 +60,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -225,7 +225,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
- - To enable DeepSeek MLA acceleration, add `--enable-mla`.
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
  - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.3.1.post1"
+ version = "0.3.1.post2"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
sglang/bench_latency.py
@@ -1,5 +1,7 @@
  """
- Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
+ Benchmark the latency of running a single static batch.
+ This script does not launch a server and uses the low-level APIs.
+ It accepts arguments similar to those of launch_server.py.

  # Usage (latency test)
  ## with dummy weights:
sglang/bench_server_latency.py (new file)
@@ -0,0 +1,187 @@
+ """
+ Benchmark the latency of serving a single batch with a real server.
+ This script launches a server and uses the HTTP interface.
+ It accepts arguments similar to those of launch_server.py.
+
+ Usage:
+
+ python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+ """
+
+ import argparse
+ import dataclasses
+ import itertools
+ import json
+ import multiprocessing
+ import os
+ import time
+ from typing import Tuple
+
+ import numpy as np
+ import requests
+
+ from sglang.srt.server import launch_server
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.utils import kill_child_process
+
+
+ @dataclasses.dataclass
+ class BenchArgs:
+     run_name: str = "default"
+     batch_size: Tuple[int] = (1,)
+     input_len: Tuple[int] = (1024,)
+     output_len: Tuple[int] = (16,)
+     result_filename: str = "result.jsonl"
+
+     @staticmethod
+     def add_cli_args(parser: argparse.ArgumentParser):
+         parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+         parser.add_argument(
+             "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+         )
+         parser.add_argument(
+             "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+         )
+         parser.add_argument(
+             "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+         )
+         parser.add_argument(
+             "--result-filename", type=str, default=BenchArgs.result_filename
+         )
+
+     @classmethod
+     def from_cli_args(cls, args: argparse.Namespace):
+         # use the default value's type to case the args into correct types.
+         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+         return cls(
+             **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+         )
+
+
+ def launch_server_internal(server_args):
+     try:
+         launch_server(server_args)
+     except Exception as e:
+         raise e
+     finally:
+         kill_child_process(os.getpid(), including_parent=False)
+
+
+ def launch_server_process(server_args: ServerArgs):
+     proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+     proc.start()
+     base_url = f"http://{server_args.host}:{server_args.port}"
+     timeout = 600
+
+     start_time = time.time()
+     while time.time() - start_time < timeout:
+         try:
+             headers = {
+                 "Content-Type": "application/json; charset=utf-8",
+             }
+             response = requests.get(f"{base_url}/v1/models", headers=headers)
+             if response.status_code == 200:
+                 return proc, base_url
+         except requests.RequestException:
+             pass
+         time.sleep(10)
+     raise TimeoutError("Server failed to start within the timeout period.")
+
+
+ def run_one_case(
+     url: str,
+     batch_size: int,
+     input_len: int,
+     output_len: int,
+     run_name: str,
+     result_filename: str,
+ ):
+     input_ids = [
+         [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+         for _ in range(batch_size)
+     ]
+
+     tic = time.time()
+     response = requests.post(
+         url + "/generate",
+         json={
+             "input_ids": input_ids,
+             "sampling_params": {
+                 "temperature": 0,
+                 "max_new_tokens": output_len,
+                 "ignore_eos": True,
+             },
+         },
+     )
+     latency = time.time() - tic
+
+     _ = response.json()
+     output_throughput = batch_size * output_len / latency
+     overall_throughput = batch_size * (input_len + output_len) / latency
+
+     print(f"batch size: {batch_size}")
+     print(f"latency: {latency:.2f} s")
+     print(f"output throughput: {output_throughput:.2f} token/s")
+     print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+     if result_filename:
+         with open(result_filename, "a") as fout:
+             res = {
+                 "run_name": run_name,
+                 "batch_size": batch_size,
+                 "input_len": input_len,
+                 "output_len": output_len,
+                 "latency": round(latency, 4),
+                 "output_throughput": round(output_throughput, 2),
+                 "overall_throughput": round(overall_throughput, 2),
+             }
+             fout.write(json.dumps(res) + "\n")
+
+
+ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+     proc, base_url = launch_server_process(server_args)
+
+     # warmup
+     run_one_case(
+         base_url,
+         batch_size=16,
+         input_len=1024,
+         output_len=16,
+         run_name="",
+         result_filename="",
+     )
+
+     # benchmark
+     try:
+         for bs, il, ol in itertools.product(
+             bench_args.batch_size, bench_args.input_len, bench_args.output_len
+         ):
+             run_one_case(
+                 base_url,
+                 bs,
+                 il,
+                 ol,
+                 bench_args.run_name,
+                 bench_args.result_filename,
+             )
+     finally:
+         kill_child_process(proc.pid)
+
+     print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     ServerArgs.add_cli_args(parser)
+     BenchArgs.add_cli_args(parser)
+     # For this script, model-path is not required
+     assert (
+         parser._actions[1].option_strings[0] == "--model-path"
+     ), "options changed, this code need to be updated"
+     parser._actions[1].required = False
+     args = parser.parse_args()
+
+     server_args = ServerArgs.from_cli_args(args)
+     bench_args = BenchArgs.from_cli_args(args)
+
+     run_benchmark(server_args, bench_args)
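The new benchmark can also be driven from Python rather than through the CLI shown in its docstring. A minimal sketch, assuming the module layout above; the model path and sweep values are illustrative, not defaults:

```python
# Minimal sketch: run the new server-latency benchmark programmatically.
# BenchArgs, run_benchmark, and ServerArgs come from the files shown in this diff;
# the model path and the sweep values below are illustrative assumptions.
from sglang.bench_server_latency import BenchArgs, run_benchmark
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B")
bench_args = BenchArgs(
    run_name="post2-check",
    batch_size=(1, 16),
    input_len=(1024,),
    output_len=(8,),
    result_filename="result.jsonl",
)

# Launches the server, runs a warmup case, then sweeps every (batch, input, output)
# combination and appends one JSON line per case to result.jsonl.
run_benchmark(server_args, bench_args)
```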
sglang/bench_serving.py
@@ -2,7 +2,7 @@
  # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py

  """
- Benchmark online serving.
+ Benchmark online serving with dynamic requests.

  Usage:
  python3 -m sglang.bench_serving --backend sglang --num-prompt 10
sglang/srt/layers/activation.py
@@ -19,7 +19,12 @@ from typing import Optional
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
+ from sglang.srt.utils import is_hip
+
+ if not is_hip():
+     from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
  from vllm.distributed import (
      divide,
      get_tensor_model_parallel_rank,
@@ -29,8 +34,6 @@ from vllm.model_executor.custom_op import CustomOp
  from vllm.model_executor.layers.quantization import QuantizationConfig
  from vllm.model_executor.utils import set_weight_attrs

- from sglang.srt.utils import is_hip
-
  logger = logging.getLogger(__name__)

sglang/srt/layers/layernorm.py
@@ -20,16 +20,19 @@ from typing import Optional, Tuple, Union

  import torch
  import torch.nn as nn
- from flashinfer.norm import (
-     fused_add_rmsnorm,
-     gemma_fused_add_rmsnorm,
-     gemma_rmsnorm,
-     rmsnorm,
- )
- from vllm.model_executor.custom_op import CustomOp

  from sglang.srt.utils import is_hip

+ if not is_hip():
+     from flashinfer.norm import (
+         fused_add_rmsnorm,
+         gemma_fused_add_rmsnorm,
+         gemma_rmsnorm,
+         rmsnorm,
+     )
+
+ from vllm.model_executor.custom_op import CustomOp
+
  logger = logging.getLogger(__name__)

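The activation.py and layernorm.py hunks above make the same change: the FlashInfer kernels are imported only when the build is not a ROCm/HIP one, so these modules can still be imported on AMD GPUs. A standalone sketch of the guard, where the fallback branch is an illustrative assumption rather than code from the diff:

```python
# Sketch of the is_hip() import guard used in the two hunks above.
# The CUDA branch mirrors the diff; the fallback is an assumption for illustration only.
import torch

from sglang.srt.utils import is_hip

if not is_hip():
    from flashinfer.norm import rmsnorm  # fused CUDA kernel
else:
    def rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # plain-PyTorch stand-in for HIP/ROCm builds (not part of the upstream change)
        variance = x.float().pow(2).mean(dim=-1, keepdim=True)
        return (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight
```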
sglang/srt/layers/sampler.py
@@ -31,8 +31,11 @@ class Sampler(nn.Module):
              logits = logits.next_token_logits

          # Post process logits
+         logits = logits.contiguous()
          logits.div_(sampling_info.temperatures)
-         probs = logits[:] = torch.softmax(logits, dim=-1)
+         probs = torch.softmax(logits, dim=-1)
+         logits = None
+         del logits

          if torch.any(torch.isnan(probs)):
              logger.warning("Detected errors during sampling! NaN in the probability.")
@@ -53,7 +56,11 @@
                  )
              else:
                  batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                     probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps
+                     probs,
+                     uniform_samples,
+                     sampling_info.top_ks,
+                     sampling_info.top_ps,
+                     filter_apply_order="joint",
                  )

              if not torch.all(success):
sglang/srt/managers/io_struct.py
@@ -133,6 +133,9 @@ class GenerateReqInput:
                  self.image_data = [None] * num
              elif not isinstance(self.image_data, list):
                  self.image_data = [self.image_data] * num
+             elif isinstance(self.image_data, list):
+                 # multi-image with n > 1
+                 self.image_data = self.image_data * num

              if self.sampling_params is None:
                  self.sampling_params = [{}] * num
sglang/srt/managers/policy_scheduler.py
@@ -119,19 +119,32 @@ class PrefillAdder:
          self.running_batch = running_batch
          self.new_token_ratio = new_token_ratio
          self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
-         self.rem_total_tokens_ = self.rem_total_tokens
-         self.total_tokens = rem_total_tokens
          self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
          self.rem_chunk_tokens = rem_chunk_tokens
          if self.rem_chunk_tokens is not None:
              self.rem_chunk_tokens -= mixed_with_decode_tokens

+         self.cur_rem_tokens = rem_total_tokens - mixed_with_decode_tokens
+
          self.req_states = None
          self.can_run_list = []
          self.new_inflight_req = None
          self.log_hit_tokens = 0
          self.log_input_tokens = 0

+         if running_batch is not None:
+             # Pre-remove the tokens which will be occupied by the running requests
+             self.rem_total_tokens -= sum(
+                 [
+                     min(
+                         (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                         CLIP_MAX_NEW_TOKENS,
+                     )
+                     * self.new_token_ratio
+                     for r in running_batch.reqs
+                 ]
+             )
+
      def no_remaining_tokens(self):
          return (
              self.rem_total_tokens <= 0
@@ -141,31 +154,14 @@
              if self.rem_chunk_tokens is not None
              else False
          )
-     )
-
-     def remove_running_tokens(self, running_batch: ScheduleBatch):
-         self.rem_total_tokens -= sum(
-             [
-                 min(
-                     (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                     CLIP_MAX_NEW_TOKENS,
-                 )
-                 * self.new_token_ratio
-                 for r in running_batch.reqs
-             ]
-         )
-         self.rem_total_tokens_ -= sum(
-             [
-                 r.sampling_params.max_new_tokens - len(r.output_ids)
-                 for r in running_batch.reqs
-             ]
+             or self.cur_rem_tokens <= 0
          )

      def _prefill_one_req(
          self, prefix_len: int, extend_input_len: int, max_new_tokens: int
      ):
          self.rem_total_tokens -= extend_input_len + max_new_tokens
-         self.rem_total_tokens_ -= extend_input_len + max_new_tokens
+         self.cur_rem_tokens -= extend_input_len
          self.rem_input_tokens -= extend_input_len
          if self.rem_chunk_tokens is not None:
              self.rem_chunk_tokens -= extend_input_len
@@ -173,29 +169,7 @@
          self.log_hit_tokens += prefix_len
          self.log_input_tokens += extend_input_len

-     def add_inflight_req_ignore_eos(self, req: Req):
-         truncated = req.extend_input_len > self.rem_chunk_tokens
-         req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
-         req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
-         self.can_run_list.append(req)
-
-         self._prefill_one_req(
-             0,
-             req.extend_input_len,
-             (
-                 min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
-                 if not truncated
-                 else 0
-             ),
-         )
-
-         # Return if chunked prefill not finished
-         return req if truncated else None
-
      def add_inflight_req(self, req: Req):
-         if req.sampling_params.ignore_eos:
-             return self.add_inflight_req_ignore_eos(req)
-
          truncated = req.extend_input_len > self.rem_chunk_tokens
          req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
          req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
@@ -225,7 +199,7 @@
          self.rem_total_tokens += delta

      def add_one_req_ignore_eos(self, req: Req):
-         def get_req_state(r):
+         def add_req_state(r, insert_sort=False):
              new_token_ratio = (
                  1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
              )
@@ -235,56 +209,38 @@
              tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)

              if tokens_left > 0:
-                 return (tokens_left, tokens_occupied)
-
-             return None
-
-         # Quick Check
-         can_run = False
-         if (
-             req.extend_input_len + req.sampling_params.max_new_tokens
-             <= self.rem_total_tokens
-         ):
-             can_run = True
-
-         if not can_run:
-             if self.req_states is None:
-                 self.req_states = []
-                 if self.running_batch is not None:
-                     for r in self.running_batch.reqs:
-                         state = get_req_state(r)
-                         if state is not None:
-                             self.req_states.append(state)
-                 for r in self.can_run_list:
-                     state = get_req_state(r)
-                     if state is not None:
-                         self.req_states.append(state)
-                 state = get_req_state(req)
-                 if state is not None:
-                     self.req_states.append(state)
-
-                 self.req_states.sort(key=lambda x: x[0])
-             else:
-                 state = get_req_state(req)
-                 if state is not None:
-                     for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-                         if tokens_left >= state[0]:
-                             self.req_states.insert(i, state)
+                 if not insert_sort:
+                     self.req_states.append((tokens_left, tokens_occupied))
+                 else:
+                     for i in range(len(self.req_states)):
+                         if tokens_left <= self.req_states[i][0]:
                              break
-                     else:
-                         self.req_states.append(state)
-
-             tokens_freed = 0
-             for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-                 decode_steps = (
-                     self.req_states[i + 1][0]
-                     if i + 1 < len(self.req_states)
-                     else tokens_left
-                 )
-                 bs = len(self.req_states) - i
-                 if self.total_tokens + tokens_freed - decode_steps * bs <= 0:
-                     return False
-                 tokens_freed += tokens_occupied
+                     self.req_states.insert(i, (tokens_left, tokens_occupied))
+
+         if self.req_states is None:
+             self.req_states = []
+             add_req_state(req)
+             if self.running_batch is not None:
+                 for r in self.running_batch.reqs:
+                     add_req_state(r)
+             for r in self.can_run_list:
+                 add_req_state(r)
+             self.req_states.sort(key=lambda x: x[0])
+         else:
+             add_req_state(req, insert_sort=True)
+
+         cur_rem_tokens = self.cur_rem_tokens - len(req.origin_input_ids)
+         tokens_freed = 0
+         for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
+             decode_steps = (
+                 self.req_states[i + 1][0]
+                 if i + 1 < len(self.req_states)
+                 else tokens_left
+             )
+             bs = len(self.req_states) - i
+             if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
+                 return False
+             tokens_freed += tokens_occupied

          if req.extend_input_len <= self.rem_chunk_tokens:
              self.can_run_list.append(req)
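The policy_scheduler.py changes fold the old remove_running_tokens and "quick check" paths into PrefillAdder itself: the constructor now pre-reserves tokens for the running batch, and add_one_req_ignore_eos always simulates whether the KV-cache budget can absorb the remaining decodes. A standalone restatement of that admission loop, with made-up numbers and the same meaning for the fields as in the diff:

```python
# Standalone sketch of the admission check in add_one_req_ignore_eos above.
# Each request is summarized as (tokens_left, tokens_occupied); requests are assumed to
# finish in ascending order of tokens_left, freeing their occupied tokens when they do.
from typing import List, Tuple


def can_admit(cur_rem_tokens: int, req_states: List[Tuple[int, int]]) -> bool:
    req_states = sorted(req_states, key=lambda x: x[0])
    tokens_freed = 0
    for i, (tokens_left, tokens_occupied) in enumerate(req_states):
        # tokens the still-running requests may need before the next one finishes
        decode_steps = req_states[i + 1][0] if i + 1 < len(req_states) else tokens_left
        bs = len(req_states) - i  # number of requests still decoding
        if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
            return False
        tokens_freed += tokens_occupied
    return True


# Made-up example: 4096 free KV slots and three requests given as (remaining, occupied).
print(can_admit(4096, [(64, 512), (256, 1024), (512, 2048)]))  # True with these numbers
```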
sglang/srt/managers/schedule_batch.py
@@ -40,7 +40,7 @@ global_server_args_dict = {
      "attention_backend": ServerArgs.attention_backend,
      "sampling_backend": ServerArgs.sampling_backend,
      "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-     "enable_mla": ServerArgs.enable_mla,
+     "disable_mla": ServerArgs.disable_mla,
      "torchao_config": ServerArgs.torchao_config,
  }

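The schedule_batch.py hunk flips the MLA switch from opt-in (`enable_mla`) to opt-out (`disable_mla`), which matches the removal of the `--enable-mla` tip from the README and PKG-INFO above; the corresponding server_args.py change is listed in the file table but not shown in this excerpt. A hedged sketch of what that implies for the defaults:

```python
# Hedged sketch: in 0.3.1.post2 the scheduler reads ServerArgs.disable_mla, suggesting MLA
# is enabled by default and turned off explicitly. The model path is illustrative only.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(model_path="deepseek-ai/DeepSeek-V2-Lite")
print(args.disable_mla)  # expected: False, i.e. MLA stays on unless explicitly disabled
```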
sglang/srt/managers/tp_worker.py
@@ -445,9 +445,6 @@ class ModelTpServer:
              num_mixed_running,
          )

-         if self.running_batch is not None:
-             adder.remove_running_tokens(self.running_batch)
-
          has_inflight = self.current_inflight_req is not None
          if self.current_inflight_req is not None:
              self.current_inflight_req.init_next_round_input(
@@ -465,9 +462,6 @@
              )

          for req in self.waiting_queue:
-             if adder.no_remaining_tokens():
-                 break
-             req.init_next_round_input(None if prefix_computed else self.tree_cache)
              if (
                  self.lora_paths is not None
                  and len(
@@ -478,6 +472,10 @@
                  > self.max_loras_per_batch
              ):
                  break
+
+             if adder.no_remaining_tokens():
+                 break
+             req.init_next_round_input(None if prefix_computed else self.tree_cache)
              res = adder.add_one_req(req)
              if (
                  not res
@@ -507,6 +505,11 @@
          else:
              tree_cache_hit_rate = 0.0

+         num_used = self.max_total_num_tokens - (
+             self.token_to_kv_pool.available_size()
+             + self.tree_cache.evictable_size()
+         )
+
          if num_mixed_running > 0:
              logger.info(
                  f"Prefill batch"
@@ -515,6 +518,7 @@
                  f"#new-token: {adder.log_input_tokens}, "
                  f"#cached-token: {adder.log_hit_tokens}, "
                  f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                  f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
              )
          else:
@@ -524,6 +528,7 @@
                  f"#new-token: {adder.log_input_tokens}, "
                  f"#cached-token: {adder.log_hit_tokens}, "
                  f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                  f"#running-req: {running_bs}, "
                  f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
              )
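The last two hunks add a "token usage" figure to the prefill log lines: the fraction of the KV-cache token pool that is neither free nor evictable, computed from `num_used` above. A small worked example with made-up pool sizes:

```python
# Worked example of the new "token usage" log field (numbers are made up).
max_total_num_tokens = 100_000   # total KV-cache token slots
available_size = 30_000          # free slots reported by token_to_kv_pool.available_size()
evictable_size = 10_000          # cached prefixes the radix tree could evict

num_used = max_total_num_tokens - (available_size + evictable_size)
print(f"token usage: {num_used / max_total_num_tokens:.2f}")  # token usage: 0.60
```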