sglang 0.3.2__tar.gz → 0.3.3__tar.gz

This diff shows the content changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (149)
  1. {sglang-0.3.2/sglang.egg-info → sglang-0.3.3}/PKG-INFO +37 -19
  2. {sglang-0.3.2 → sglang-0.3.3}/README.md +35 -18
  3. {sglang-0.3.2 → sglang-0.3.3}/pyproject.toml +2 -2
  4. {sglang-0.3.2 → sglang-0.3.3}/sglang/__init__.py +2 -0
  5. {sglang-0.3.2 → sglang-0.3.3}/sglang/api.py +23 -1
  6. {sglang-0.3.2 → sglang-0.3.3}/sglang/bench_latency.py +46 -25
  7. {sglang-0.3.2 → sglang-0.3.3}/sglang/bench_serving.py +2 -2
  8. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/runtime_endpoint.py +14 -1
  9. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/interpreter.py +16 -6
  10. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/ir.py +20 -4
  11. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/configs/model_config.py +11 -9
  12. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/fsm_cache.py +9 -1
  13. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/jump_forward.py +15 -2
  14. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/activation.py +4 -4
  15. sglang-0.3.3/sglang/srt/layers/attention/__init__.py +49 -0
  16. sglang-0.3.3/sglang/srt/layers/attention/flashinfer_backend.py +277 -0
  17. {sglang-0.3.2/sglang/srt/layers → sglang-0.3.3/sglang/srt/layers/attention}/flashinfer_utils.py +82 -80
  18. sglang-0.3.3/sglang/srt/layers/attention/triton_backend.py +161 -0
  19. {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/extend_attention.py +3 -1
  20. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/layernorm.py +4 -4
  21. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/logits_processor.py +19 -15
  22. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/pooler.py +3 -3
  23. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/quantization/__init__.py +0 -2
  24. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/radix_attention.py +6 -4
  25. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/sampler.py +6 -4
  26. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/torchao_utils.py +18 -0
  27. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/lora/lora.py +20 -21
  28. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/lora/lora_manager.py +97 -25
  29. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/detokenizer_manager.py +31 -18
  30. sglang-0.3.3/sglang/srt/managers/image_processor.py +187 -0
  31. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/io_struct.py +99 -75
  32. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/schedule_batch.py +184 -63
  33. sglang-0.3.2/sglang/srt/managers/policy_scheduler.py → sglang-0.3.3/sglang/srt/managers/schedule_policy.py +31 -21
  34. sglang-0.3.2/sglang/srt/managers/tp_worker.py → sglang-0.3.3/sglang/srt/managers/scheduler.py +379 -383
  35. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/tokenizer_manager.py +120 -248
  36. sglang-0.3.3/sglang/srt/managers/tp_worker.py +128 -0
  37. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/memory_pool.py +34 -52
  38. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/model_executor/cuda_graph_runner.py +15 -19
  39. sglang-0.3.3/sglang/srt/model_executor/forward_batch_info.py +173 -0
  40. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/model_executor/model_runner.py +76 -75
  41. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/baichuan.py +10 -10
  42. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/chatglm.py +12 -12
  43. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/commandr.py +10 -10
  44. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/dbrx.py +12 -12
  45. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/deepseek.py +10 -10
  46. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/deepseek_v2.py +14 -15
  47. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/exaone.py +10 -10
  48. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/gemma.py +10 -10
  49. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/gemma2.py +11 -11
  50. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/gpt_bigcode.py +10 -10
  51. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/grok.py +10 -10
  52. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/internlm2.py +10 -10
  53. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llama.py +14 -10
  54. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llama_classification.py +5 -5
  55. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llama_embedding.py +4 -4
  56. sglang-0.3.3/sglang/srt/models/llama_reward.py +142 -0
  57. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llava.py +39 -33
  58. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llavavid.py +31 -28
  59. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/minicpm.py +10 -10
  60. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/minicpm3.py +14 -15
  61. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/mixtral.py +10 -10
  62. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/mixtral_quant.py +10 -10
  63. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/olmoe.py +10 -10
  64. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/qwen.py +10 -10
  65. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/qwen2.py +11 -11
  66. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/qwen2_moe.py +10 -10
  67. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/stablelm.py +10 -10
  68. sglang-0.3.3/sglang/srt/models/torch_native_llama.py +506 -0
  69. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/xverse.py +10 -10
  70. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/xverse_moe.py +10 -10
  71. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/sampling_batch_info.py +36 -27
  72. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/sampling_params.py +3 -1
  73. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/server.py +170 -119
  74. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/server_args.py +54 -27
  75. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/utils.py +101 -128
  76. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/runners.py +71 -26
  77. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_programs.py +38 -5
  78. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_utils.py +18 -9
  79. sglang-0.3.3/sglang/version.py +1 -0
  80. {sglang-0.3.2 → sglang-0.3.3/sglang.egg-info}/PKG-INFO +37 -19
  81. {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/SOURCES.txt +12 -8
  82. {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/requires.txt +1 -0
  83. sglang-0.3.2/sglang/srt/layers/attention_backend.py +0 -474
  84. sglang-0.3.2/sglang/srt/managers/controller_multi.py +0 -207
  85. sglang-0.3.2/sglang/srt/managers/controller_single.py +0 -164
  86. sglang-0.3.2/sglang/srt/model_executor/forward_batch_info.py +0 -174
  87. sglang-0.3.2/sglang/version.py +0 -1
  88. {sglang-0.3.2 → sglang-0.3.3}/LICENSE +0 -0
  89. {sglang-0.3.2 → sglang-0.3.3}/setup.cfg +0 -0
  90. {sglang-0.3.2 → sglang-0.3.3}/sglang/bench_server_latency.py +0 -0
  91. {sglang-0.3.2 → sglang-0.3.3}/sglang/check_env.py +0 -0
  92. {sglang-0.3.2 → sglang-0.3.3}/sglang/global_config.py +0 -0
  93. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/__init__.py +0 -0
  94. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/__init__.py +0 -0
  95. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/anthropic.py +0 -0
  96. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/base_backend.py +0 -0
  97. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/litellm.py +0 -0
  98. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/openai.py +0 -0
  99. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/vertexai.py +0 -0
  100. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/chat_template.py +0 -0
  101. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/choices.py +0 -0
  102. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/compiler.py +0 -0
  103. {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/tracer.py +0 -0
  104. {sglang-0.3.2 → sglang-0.3.3}/sglang/launch_server.py +0 -0
  105. {sglang-0.3.2 → sglang-0.3.3}/sglang/launch_server_llavavid.py +0 -0
  106. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/configs/__init__.py +0 -0
  107. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/configs/exaone.py +0 -0
  108. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/__init__.py +0 -0
  109. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/base_tool_cache.py +0 -0
  110. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/conversation.py +0 -0
  111. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/hf_transformers_utils.py +0 -0
  112. {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/decode_attention.py +0 -0
  113. {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/prefill_attention.py +0 -0
  114. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  115. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  116. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/layer.py +0 -0
  117. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/patch.py +0 -0
  118. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/linear.py +0 -0
  119. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/quantization/base_config.py +0 -0
  120. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/lora/lora_config.py +0 -0
  121. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  122. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  123. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/flush_cache.py +0 -0
  124. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/radix_cache.py +0 -0
  125. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mm_utils.py +0 -0
  126. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/mistral.py +0 -0
  127. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/yivl.py +0 -0
  128. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/openai_api/adapter.py +0 -0
  129. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/openai_api/protocol.py +0 -0
  130. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  131. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  132. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  133. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  134. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  135. {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  136. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/few_shot_gsm8k.py +0 -0
  137. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/run_eval.py +0 -0
  138. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_common.py +0 -0
  139. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_gpqa.py +0 -0
  140. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_humaneval.py +0 -0
  141. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_math.py +0 -0
  142. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_mgsm.py +0 -0
  143. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_mmlu.py +0 -0
  144. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  145. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_activation.py +0 -0
  146. {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_layernorm.py +0 -0
  147. {sglang-0.3.2 → sglang-0.3.3}/sglang/utils.py +0 -0
  148. {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/dependency_links.txt +0 -0
  149. {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.3.2/sglang.egg-info → sglang-0.3.3}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.2
+ Version: 0.3.3
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -233,6 +233,7 @@ Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
  Requires-Dist: vllm==0.5.5; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
+ Requires-Dist: modelscope; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -269,16 +270,11 @@ Requires-Dist: sglang[test]; extra == "dev"

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

- SGLang is a fast serving framework for large language models and vision language models.
- It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
- The core features include:
-
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
- - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+ ## Upcoming Events
+ - [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
+ - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

  ## News
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -294,6 +290,16 @@ The core features include:

  </details>

+ ## About
+ SGLang is a fast serving framework for large language models and vision language models.
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+ The core features include:
+
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
  ## Contents
  - [Install](#install)
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -318,7 +324,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -339,7 +345,7 @@ docker run --gpus all \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
- python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

  ### Method 4: Using docker compose
@@ -379,7 +385,7 @@ resources:
  run: |
  conda deactivate
  python3 -m sglang.launch_server \
- --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
  --host 0.0.0.0 \
  --port 30000
  ```
@@ -421,7 +427,8 @@ curl http://localhost:30000/generate \
  }
  }'
  ```
- Learn more about the argument format [here](docs/en/sampling_params.md).
+
+ Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -460,7 +467,7 @@ response = client.embeddings.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -481,10 +488,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
  ```
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+ - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+ - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
  ```
  # Node 0
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -499,9 +507,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
- - OLMoE
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
+ - OLMoE
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -523,7 +531,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - XVERSE / XVERSE MoE
  - SmolLM

-
  **Embedding Models**

  - e5-mistral
@@ -544,6 +551,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
  ```
+
+ Or start it by docker.
+ ```bash
+ docker run --gpus all \
+ -p 30000:30000 \
+ -v ~/.cache/modelscope:/root/.cache/modelscope \
+ --env "SGLANG_USE_MODELSCOPE=true" \
+ --ipc=host \
+ lmsysorg/sglang:latest \
+ python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+ ```

  </details>

@@ -582,7 +600,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
- The example below shows how to use sglang to answer a mulit-turn question.
+ The example below shows how to use sglang to answer a multi-turn question.

  #### Using Local Models
  First, launch a server with

{sglang-0.3.2 → sglang-0.3.3}/README.md
@@ -11,16 +11,11 @@

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

- SGLang is a fast serving framework for large language models and vision language models.
- It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
- The core features include:
-
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
- - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+ ## Upcoming Events
+ - [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
+ - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

  ## News
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -36,6 +31,16 @@ The core features include:

  </details>

+ ## About
+ SGLang is a fast serving framework for large language models and vision language models.
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+ The core features include:
+
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
  ## Contents
  - [Install](#install)
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -60,7 +65,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -81,7 +86,7 @@ docker run --gpus all \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
- python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

  ### Method 4: Using docker compose
@@ -121,7 +126,7 @@ resources:
  run: |
  conda deactivate
  python3 -m sglang.launch_server \
- --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
  --host 0.0.0.0 \
  --port 30000
  ```
@@ -163,7 +168,8 @@ curl http://localhost:30000/generate \
  }
  }'
  ```
- Learn more about the argument format [here](docs/en/sampling_params.md).
+
+ Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -202,7 +208,7 @@ response = client.embeddings.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -223,10 +229,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
  ```
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+ - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+ - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
  ```
  # Node 0
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -241,9 +248,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
- - OLMoE
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
+ - OLMoE
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -265,7 +272,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - XVERSE / XVERSE MoE
  - SmolLM

-
  **Embedding Models**

  - e5-mistral
@@ -286,6 +292,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
  ```
+
+ Or start it by docker.
+ ```bash
+ docker run --gpus all \
+ -p 30000:30000 \
+ -v ~/.cache/modelscope:/root/.cache/modelscope \
+ --env "SGLANG_USE_MODELSCOPE=true" \
+ --ipc=host \
+ lmsysorg/sglang:latest \
+ python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+ ```

  </details>

@@ -324,7 +341,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
- The example below shows how to use sglang to answer a mulit-turn question.
+ The example below shows how to use sglang to answer a multi-turn question.

  #### Using Local Models
  First, launch a server with

{sglang-0.3.2 → sglang-0.3.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.3.2"
+ version = "0.3.3"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -23,7 +23,7 @@ dependencies = [
  srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
  "packaging", "pillow", "psutil", "pydantic", "python-multipart",
  "torch", "torchao", "uvicorn", "uvloop", "zmq",
- "vllm==0.5.5", "outlines>=0.0.44"]
+ "vllm==0.5.5", "outlines>=0.0.44", "modelscope"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]

{sglang-0.3.2 → sglang-0.3.3}/sglang/__init__.py
@@ -1,6 +1,7 @@
  # SGL API Components

  from sglang.api import (
+ Engine,
  Runtime,
  assistant,
  assistant_begin,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
  # SGLang DSL APIs
  __all__ = [
  "Runtime",
+ "Engine",
  "assistant",
  "assistant_begin",
  "assistant_end",

{sglang-0.3.2 → sglang-0.3.3}/sglang/api.py
@@ -33,13 +33,23 @@ def function(


  def Runtime(*args, **kwargs):
- # Avoid importing unnecessary dependency
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+ # Avoid importing unnecessary dependency
  from sglang.srt.server import Runtime

  return Runtime(*args, **kwargs)


+ def Engine(*args, **kwargs):
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+ # Avoid importing unnecessary dependency
+ from sglang.srt.server import Engine
+
+ return Engine(*args, **kwargs)
+
+
  def set_default_backend(backend: BaseBackend):
  global_config.default_backend = backend

@@ -48,6 +58,10 @@ def flush_cache(backend: Optional[BaseBackend] = None):
  backend = backend or global_config.default_backend
  if backend is None:
  return False
+
+ # If backend is Runtime
+ if hasattr(backend, "endpoint"):
+ backend = backend.endpoint
  return backend.flush_cache()


@@ -55,12 +69,17 @@ def get_server_args(backend: Optional[BaseBackend] = None):
  backend = backend or global_config.default_backend
  if backend is None:
  return None
+
+ # If backend is Runtime
+ if hasattr(backend, "endpoint"):
+ backend = backend.endpoint
  return backend.get_server_args()


  def gen(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
+ min_tokens: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
@@ -100,6 +119,7 @@ def gen(
  return SglGen(
  name,
  max_tokens,
+ min_tokens,
  stop,
  stop_token_ids,
  temperature,
@@ -139,6 +159,7 @@ def gen_int(
  return SglGen(
  name,
  max_tokens,
+ None,
  stop,
  stop_token_ids,
  temperature,
@@ -177,6 +198,7 @@ def gen_string(
  return SglGen(
  name,
  max_tokens,
+ None,
  stop,
  stop_token_ids,
  temperature,
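
The hunks above export a new `Engine` entry point next to `Runtime` (and add a `min_tokens` argument to `gen`). For orientation only, here is a minimal usage sketch that is not part of the diff: it assumes `Engine` forwards to `sglang.srt.server.Engine` as shown above, accepts a `model_path` keyword like `Runtime`, exposes a `generate()` method taking a prompt (or list of prompts) plus a sampling-parameter dict, and returns per-prompt dicts with a `text` field. The model path and parameters below are illustrative.

```python
# Hedged sketch of the new sgl.Engine offline entry point; constructor and
# generate() signature are assumptions based on the export added in this diff.
import sglang as sgl

if __name__ == "__main__":
    # Example model path, not a value taken from the diff.
    llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")

    prompts = ["The capital of France is", "Write a haiku about GPUs:"]
    sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 32}

    # Runs in-process, without launching the HTTP server that Runtime wraps.
    outputs = llm.generate(prompts, sampling_params)
    for prompt, out in zip(prompts, outputs):
        print(prompt, "->", out["text"])
```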

{sglang-0.3.2 → sglang-0.3.3}/sglang/bench_latency.py
@@ -47,6 +47,7 @@ I'm going to the park
  import argparse
  import dataclasses
  import itertools
+ import json
  import logging
  import multiprocessing
  import os
@@ -62,10 +63,11 @@ import torch.distributed as dist
  from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.hf_transformers_utils import get_tokenizer
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server import _set_envs_and_config
- from sglang.srt.server_args import ServerArgs
+ from sglang.srt.server_args import PortArgs, ServerArgs
  from sglang.srt.utils import (
  configure_logger,
  kill_child_process,
@@ -121,7 +123,7 @@ class BenchArgs:
  )


- def load_model(server_args, tp_rank):
+ def load_model(server_args, port_args, tp_rank):
  suppress_other_loggers()
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

@@ -129,6 +131,7 @@ def load_model(server_args, tp_rank):
  server_args.model_path,
  server_args.trust_remote_code,
  context_length=server_args.context_length,
+ model_override_args=json.loads(server_args.json_model_override_args),
  )
  model_runner = ModelRunner(
  model_config=model_config,
@@ -136,7 +139,7 @@ def load_model(server_args, tp_rank):
  gpu_id=tp_rank,
  tp_rank=tp_rank,
  tp_size=server_args.tp_size,
- nccl_port=28888,
+ nccl_port=port_args.nccl_ports[0],
  server_args=server_args,
  )
  rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -167,9 +170,13 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
  assert len(input_ids[i]) > bench_args.cut_len

  tmp_input_ids = input_ids[i][: bench_args.cut_len]
- req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
+ req = Req(
+ rid=i,
+ origin_input_text=prompts[i],
+ origin_input_ids=tmp_input_ids,
+ sampling_params=sampling_params,
+ )
  req.prefix_indices = []
- req.sampling_params = sampling_params
  req.fill_ids = req.origin_input_ids
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
  reqs.append(req)
@@ -199,9 +206,13 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):

  reqs = []
  for i in range(len(input_ids)):
- req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
+ req = Req(
+ rid=i,
+ origin_input_text="",
+ origin_input_ids=list(input_ids[i]),
+ sampling_params=sampling_params,
+ )
  req.prefix_indices = []
- req.sampling_params = sampling_params
  req.fill_ids = req.origin_input_ids
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
  reqs.append(req)
@@ -217,28 +228,33 @@ def extend(reqs, model_runner):
  tree_cache=None,
  )
  batch.prepare_for_extend(model_runner.model_config.vocab_size)
- logits_output = model_runner.forward(batch)
- next_token_ids = model_runner.sample(logits_output, batch).tolist()
+ model_worker_batch = batch.get_model_worker_batch()
+ forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+ logits_output = model_runner.forward(forward_batch)
+ next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
  return next_token_ids, logits_output.next_token_logits, batch


  def decode(input_token_ids, batch, model_runner):
  batch.prepare_for_decode(input_token_ids)
- logits_output = model_runner.forward(batch)
- next_token_ids = model_runner.sample(logits_output, batch).tolist()
+ model_worker_batch = batch.get_model_worker_batch()
+ forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+ logits_output = model_runner.forward(forward_batch)
+ next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
  return next_token_ids, logits_output.next_token_logits


  @torch.inference_mode()
  def correctness_test(
  server_args,
+ port_args,
  bench_args,
  tp_rank,
  ):
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

  # Load the model
- model_runner, tokenizer = load_model(server_args, tp_rank)
+ model_runner, tokenizer = load_model(server_args, port_args, tp_rank)

  # Prepare inputs
  input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
@@ -324,13 +340,16 @@ def latency_test_run_once(
  rank_print(
  f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
  )
- med_decode_latency = np.median(decode_latencies)
- med_decode_throughput = batch_size / med_decode_latency
- rank_print(
- f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
- )
- measurement_results["median_decode_latency"] = med_decode_latency
- measurement_results["median_decode_throughput"] = med_decode_throughput
+
+ # record decode timing from 2nd output
+ if output_len > 1:
+ med_decode_latency = np.median(decode_latencies)
+ med_decode_throughput = batch_size / med_decode_latency
+ rank_print(
+ f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
+ )
+ measurement_results["median_decode_latency"] = med_decode_latency
+ measurement_results["median_decode_throughput"] = med_decode_throughput

  throughput = (input_len + output_len) * batch_size / tot_latency
  rank_print(
@@ -343,15 +362,15 @@

  def latency_test(
  server_args,
+ port_args,
  bench_args,
  tp_rank,
  ):
  configure_logger(server_args, prefix=f" TP{tp_rank}")
- _set_envs_and_config(server_args)
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

  # Load the model
- model_runner, tokenizer = load_model(server_args, tp_rank)
+ model_runner, tokenizer = load_model(server_args, port_args, tp_rank)

  # Prepare inputs for warm up
  reqs = prepare_synthetic_inputs_for_latency_test(
@@ -367,7 +386,7 @@ def latency_test(
  reqs,
  bench_args.batch_size[0],
  bench_args.input_len[0],
- 4, # shorter decoding to speed up the warmup
+ 8, # shorter decoding to speed up the warmup
  )
  rank_print("Benchmark ...")

@@ -453,6 +472,7 @@


  def main(server_args, bench_args):
+ _set_envs_and_config(server_args)

  if server_args.model_path:
  if bench_args.correctness_test:
@@ -468,8 +488,10 @@ def main(server_args, bench_args):
  "provide --result-filename for plotting the results"
  )

+ port_args = PortArgs.init_new(server_args)
+
  if server_args.tp_size == 1:
- work_func(server_args, bench_args, 0)
+ work_func(server_args, port_args, bench_args, 0)
  else:
  workers = []
  for tp_rank in range(server_args.tp_size):
@@ -477,6 +499,7 @@ def main(server_args, bench_args):
  target=work_func,
  args=(
  server_args,
+ port_args,
  bench_args,
  tp_rank,
  ),
@@ -503,8 +526,6 @@ if __name__ == "__main__":
  format="%(message)s",
  )

- multiprocessing.set_start_method("spawn", force=True)
-
  try:
  main(server_args, bench_args)
  except Exception as e:

{sglang-0.3.2 → sglang-0.3.3}/sglang/bench_serving.py
@@ -845,6 +845,7 @@ def run_benchmark(args_: argparse.Namespace):
  tokenizer = get_tokenizer(tokenizer_id)

  if args.dataset_name == "sharegpt":
+ assert args.random_input_len is None and args.random_output_len is None
  input_requests = sample_sharegpt_requests(
  dataset_path=args.dataset_path,
  num_requests=args.num_prompts,
@@ -852,6 +853,7 @@ def run_benchmark(args_: argparse.Namespace):
  fixed_output_len=args.sharegpt_output_len,
  )
  elif args.dataset_name == "random":
+ assert args.random_input_len is not None and args.random_output_len is not None
  input_requests = sample_random_requests(
  input_len=args.random_input_len,
  output_len=args.random_output_len,
@@ -964,13 +966,11 @@ if __name__ == "__main__":
  parser.add_argument(
  "--random-input-len",
  type=int,
- default=1024,
  help="Number of input tokens per request, used only for random dataset.",
  )
  parser.add_argument(
  "--random-output-len",
  type=int,
- default=128,
  help="Number of output tokens per request, used only for random dataset.",
  )
  parser.add_argument(