sglang 0.3.2__tar.gz → 0.3.3.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. {sglang-0.3.2/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO +46 -21
  2. {sglang-0.3.2 → sglang-0.3.3.post1}/README.md +44 -20
  3. {sglang-0.3.2 → sglang-0.3.3.post1}/pyproject.toml +2 -2
  4. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/__init__.py +2 -0
  5. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/api.py +23 -1
  6. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/bench_latency.py +48 -27
  7. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/bench_serving.py +2 -2
  8. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/runtime_endpoint.py +14 -1
  9. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/interpreter.py +16 -6
  10. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/ir.py +20 -4
  11. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/configs/model_config.py +11 -9
  12. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/fsm_cache.py +9 -1
  13. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/jump_forward.py +15 -2
  14. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/conversation.py +11 -2
  15. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/activation.py +4 -4
  16. sglang-0.3.3.post1/sglang/srt/layers/attention/__init__.py +49 -0
  17. sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_backend.py +277 -0
  18. {sglang-0.3.2/sglang/srt/layers → sglang-0.3.3.post1/sglang/srt/layers/attention}/flashinfer_utils.py +82 -80
  19. sglang-0.3.3.post1/sglang/srt/layers/attention/triton_backend.py +161 -0
  20. {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3.post1/sglang/srt/layers/attention/triton_ops}/extend_attention.py +3 -1
  21. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/layernorm.py +4 -4
  22. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/logits_processor.py +19 -15
  23. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/pooler.py +3 -3
  24. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/__init__.py +0 -2
  25. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/radix_attention.py +6 -4
  26. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/sampler.py +6 -4
  27. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/torchao_utils.py +18 -0
  28. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/lora/lora.py +20 -21
  29. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/lora/lora_manager.py +97 -25
  30. sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py +177 -0
  31. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/detokenizer_manager.py +31 -18
  32. sglang-0.3.3.post1/sglang/srt/managers/image_processor.py +187 -0
  33. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py +105 -76
  34. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py +190 -63
  35. sglang-0.3.2/sglang/srt/managers/policy_scheduler.py → sglang-0.3.3.post1/sglang/srt/managers/schedule_policy.py +31 -21
  36. sglang-0.3.2/sglang/srt/managers/tp_worker.py → sglang-0.3.3.post1/sglang/srt/managers/scheduler.py +420 -383
  37. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py +129 -248
  38. sglang-0.3.3.post1/sglang/srt/managers/tp_worker.py +128 -0
  39. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/memory_pool.py +34 -52
  40. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/model_executor/cuda_graph_runner.py +15 -19
  41. sglang-0.3.3.post1/sglang/srt/model_executor/forward_batch_info.py +173 -0
  42. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/model_executor/model_runner.py +111 -105
  43. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/baichuan.py +10 -10
  44. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/chatglm.py +12 -12
  45. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/commandr.py +10 -10
  46. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/dbrx.py +12 -12
  47. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/deepseek.py +10 -10
  48. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/deepseek_v2.py +14 -15
  49. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/exaone.py +10 -10
  50. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/gemma.py +10 -10
  51. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/gemma2.py +11 -11
  52. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/gpt_bigcode.py +10 -10
  53. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/grok.py +10 -10
  54. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/internlm2.py +10 -10
  55. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llama.py +14 -10
  56. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llama_classification.py +5 -5
  57. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llama_embedding.py +4 -4
  58. sglang-0.3.3.post1/sglang/srt/models/llama_reward.py +142 -0
  59. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llava.py +39 -33
  60. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llavavid.py +31 -28
  61. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/minicpm.py +10 -10
  62. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/minicpm3.py +14 -15
  63. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/mixtral.py +10 -10
  64. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/mixtral_quant.py +10 -10
  65. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/olmoe.py +10 -10
  66. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/qwen.py +10 -10
  67. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/qwen2.py +11 -11
  68. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/qwen2_moe.py +10 -10
  69. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/stablelm.py +10 -10
  70. sglang-0.3.3.post1/sglang/srt/models/torch_native_llama.py +506 -0
  71. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/xverse.py +10 -10
  72. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/xverse_moe.py +10 -10
  73. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/openai_api/adapter.py +5 -3
  74. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_batch_info.py +54 -33
  75. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_params.py +3 -1
  76. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/server.py +203 -117
  77. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/server_args.py +59 -29
  78. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/utils.py +127 -139
  79. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/runners.py +71 -26
  80. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_programs.py +38 -5
  81. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_utils.py +18 -9
  82. sglang-0.3.3.post1/sglang/version.py +1 -0
  83. {sglang-0.3.2 → sglang-0.3.3.post1/sglang.egg-info}/PKG-INFO +46 -21
  84. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/SOURCES.txt +13 -8
  85. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/requires.txt +1 -0
  86. sglang-0.3.2/sglang/srt/layers/attention_backend.py +0 -474
  87. sglang-0.3.2/sglang/srt/managers/controller_multi.py +0 -207
  88. sglang-0.3.2/sglang/srt/managers/controller_single.py +0 -164
  89. sglang-0.3.2/sglang/srt/model_executor/forward_batch_info.py +0 -174
  90. sglang-0.3.2/sglang/version.py +0 -1
  91. {sglang-0.3.2 → sglang-0.3.3.post1}/LICENSE +0 -0
  92. {sglang-0.3.2 → sglang-0.3.3.post1}/setup.cfg +0 -0
  93. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/bench_server_latency.py +0 -0
  94. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/check_env.py +0 -0
  95. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/global_config.py +0 -0
  96. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/__init__.py +0 -0
  97. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/__init__.py +0 -0
  98. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/anthropic.py +0 -0
  99. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/base_backend.py +0 -0
  100. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/litellm.py +0 -0
  101. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/openai.py +0 -0
  102. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/vertexai.py +0 -0
  103. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/chat_template.py +0 -0
  104. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/choices.py +0 -0
  105. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/compiler.py +0 -0
  106. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/tracer.py +0 -0
  107. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/launch_server.py +0 -0
  108. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/launch_server_llavavid.py +0 -0
  109. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/configs/__init__.py +0 -0
  110. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/configs/exaone.py +0 -0
  111. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/__init__.py +0 -0
  112. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
  113. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  114. {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3.post1/sglang/srt/layers/attention/triton_ops}/decode_attention.py +0 -0
  115. {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3.post1/sglang/srt/layers/attention/triton_ops}/prefill_attention.py +0 -0
  116. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  117. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  118. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
  119. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
  120. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/linear.py +0 -0
  121. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  122. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/lora/lora_config.py +0 -0
  123. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  124. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  125. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  126. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  127. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mm_utils.py +0 -0
  128. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/mistral.py +0 -0
  129. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/yivl.py +0 -0
  130. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/openai_api/protocol.py +0 -0
  131. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  132. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  133. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  134. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  135. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  136. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  137. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  138. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/run_eval.py +0 -0
  139. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_common.py +0 -0
  140. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  141. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  142. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_math.py +0 -0
  143. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  144. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  145. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  146. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_activation.py +0 -0
  147. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_layernorm.py +0 -0
  148. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/utils.py +0 -0
  149. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/dependency_links.txt +0 -0
  150. {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.2
+ Version: 0.3.3.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -233,6 +233,7 @@ Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
  Requires-Dist: vllm==0.5.5; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
+ Requires-Dist: modelscope; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -256,8 +257,8 @@ Provides-Extra: dev
  Requires-Dist: sglang[all]; extra == "dev"
  Requires-Dist: sglang[test]; extra == "dev"

- <div align="center">
- <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+ <div align="center" id="sglangtop">
+ <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

  [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -269,16 +270,10 @@ Requires-Dist: sglang[test]; extra == "dev"

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

- SGLang is a fast serving framework for large language models and vision language models.
- It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
- The core features include:
-
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
- - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+ ## Upcoming Events
+ - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

  ## News
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -294,6 +289,16 @@ The core features include:

  </details>

+ ## About
+ SGLang is a fast serving framework for large language models and vision language models.
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+ The core features include:
+
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
  ## Contents
  - [Install](#install)
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -318,7 +323,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -339,7 +344,7 @@ docker run --gpus all \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
- python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

  ### Method 4: Using docker compose
@@ -379,7 +384,7 @@ resources:
  run: |
  conda deactivate
  python3 -m sglang.launch_server \
- --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
  --host 0.0.0.0 \
  --port 30000
  ```
@@ -421,7 +426,8 @@ curl http://localhost:30000/generate \
  }
  }'
  ```
- Learn more about the argument format [here](docs/en/sampling_params.md).
+
+ Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -460,7 +466,7 @@ response = client.embeddings.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -481,10 +487,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
  ```
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+ - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+ - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
  ```
  # Node 0
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -499,9 +506,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
- - OLMoE
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
+ - OLMoE
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -523,7 +530,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - XVERSE / XVERSE MoE
  - SmolLM

-
  **Embedding Models**

  - e5-mistral
@@ -544,6 +550,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
  ```
+
+ Or start it by docker.
+ ```bash
+ docker run --gpus all \
+ -p 30000:30000 \
+ -v ~/.cache/modelscope:/root/.cache/modelscope \
+ --env "SGLANG_USE_MODELSCOPE=true" \
+ --ipc=host \
+ lmsysorg/sglang:latest \
+ python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+ ```

  </details>

@@ -582,7 +599,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
- The example below shows how to use sglang to answer a mulit-turn question.
+ The example below shows how to use sglang to answer a multi-turn question.

  #### Using Local Models
  First, launch a server with
@@ -830,3 +847,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
  ## Citation And Acknowledgment
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+ <p align="center">
+ <a href="#sglangtop" target="_blank">
+ <bold>Back To Top </bold>
+ </a>
+ </p>
README.md

@@ -1,5 +1,5 @@
- <div align="center">
- <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+ <div align="center" id="sglangtop">
+ <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

  [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -11,16 +11,10 @@

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

- SGLang is a fast serving framework for large language models and vision language models.
- It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
- The core features include:
-
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
- - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+ ## Upcoming Events
+ - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

  ## News
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -36,6 +30,16 @@ The core features include:

  </details>

+ ## About
+ SGLang is a fast serving framework for large language models and vision language models.
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+ The core features include:
+
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
  ## Contents
  - [Install](#install)
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -60,7 +64,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -81,7 +85,7 @@ docker run --gpus all \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
- python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

  ### Method 4: Using docker compose
@@ -121,7 +125,7 @@ resources:
  run: |
  conda deactivate
  python3 -m sglang.launch_server \
- --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
  --host 0.0.0.0 \
  --port 30000
  ```
@@ -163,7 +167,8 @@ curl http://localhost:30000/generate \
  }
  }'
  ```
- Learn more about the argument format [here](docs/en/sampling_params.md).
+
+ Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -202,7 +207,7 @@ response = client.embeddings.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -223,10 +228,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
  ```
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+ - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+ - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
  ```
  # Node 0
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -241,9 +247,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
- - OLMoE
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
+ - OLMoE
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -265,7 +271,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - XVERSE / XVERSE MoE
  - SmolLM

-
  **Embedding Models**

  - e5-mistral
@@ -286,6 +291,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
  ```
+
+ Or start it by docker.
+ ```bash
+ docker run --gpus all \
+ -p 30000:30000 \
+ -v ~/.cache/modelscope:/root/.cache/modelscope \
+ --env "SGLANG_USE_MODELSCOPE=true" \
+ --ipc=host \
+ lmsysorg/sglang:latest \
+ python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+ ```

  </details>

@@ -324,7 +340,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
- The example below shows how to use sglang to answer a mulit-turn question.
+ The example below shows how to use sglang to answer a multi-turn question.

  #### Using Local Models
  First, launch a server with
@@ -572,3 +588,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
  ## Citation And Acknowledgment
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+ <p align="center">
+ <a href="#sglangtop" target="_blank">
+ <bold>Back To Top </bold>
+ </a>
+ </p>
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.3.2"
+ version = "0.3.3.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -23,7 +23,7 @@ dependencies = [
  srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
  "packaging", "pillow", "psutil", "pydantic", "python-multipart",
  "torch", "torchao", "uvicorn", "uvloop", "zmq",
- "vllm==0.5.5", "outlines>=0.0.44"]
+ "vllm==0.5.5", "outlines>=0.0.44", "modelscope"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
sglang/__init__.py

@@ -1,6 +1,7 @@
  # SGL API Components

  from sglang.api import (
+ Engine,
  Runtime,
  assistant,
  assistant_begin,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
  # SGLang DSL APIs
  __all__ = [
  "Runtime",
+ "Engine",
  "assistant",
  "assistant_begin",
  "assistant_end",
sglang/api.py

@@ -33,13 +33,23 @@ def function(


  def Runtime(*args, **kwargs):
- # Avoid importing unnecessary dependency
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+ # Avoid importing unnecessary dependency
  from sglang.srt.server import Runtime

  return Runtime(*args, **kwargs)


+ def Engine(*args, **kwargs):
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+ # Avoid importing unnecessary dependency
+ from sglang.srt.server import Engine
+
+ return Engine(*args, **kwargs)
+
+
  def set_default_backend(backend: BaseBackend):
  global_config.default_backend = backend

@@ -48,6 +58,10 @@ def flush_cache(backend: Optional[BaseBackend] = None):
  backend = backend or global_config.default_backend
  if backend is None:
  return False
+
+ # If backend is Runtime
+ if hasattr(backend, "endpoint"):
+ backend = backend.endpoint
  return backend.flush_cache()


@@ -55,12 +69,17 @@ def get_server_args(backend: Optional[BaseBackend] = None):
  backend = backend or global_config.default_backend
  if backend is None:
  return None
+
+ # If backend is Runtime
+ if hasattr(backend, "endpoint"):
+ backend = backend.endpoint
  return backend.get_server_args()


  def gen(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
+ min_tokens: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
@@ -100,6 +119,7 @@ def gen(
  return SglGen(
  name,
  max_tokens,
+ min_tokens,
  stop,
  stop_token_ids,
  temperature,
@@ -139,6 +159,7 @@ def gen_int(
  return SglGen(
  name,
  max_tokens,
+ None,
  stop,
  stop_token_ids,
  temperature,
@@ -177,6 +198,7 @@ def gen_string(
  return SglGen(
  name,
  max_tokens,
+ None,
  stop,
  stop_token_ids,
  temperature,
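
The api.py hunks above add two user-facing pieces: a new `Engine` entry point that wraps `sglang.srt.server.Engine`, and a `min_tokens` argument threaded through `gen`. A minimal usage sketch follows; it assumes `Engine` accepts server-style keyword arguments such as `model_path` and exposes a `generate()` method taking a prompt plus a sampling-parameter dict. Those call signatures are not shown in this diff, so treat the names below as illustrative rather than the confirmed API.

```python
# Sketch only: the Engine constructor kwargs and generate() signature are assumed,
# not confirmed by this diff; see sglang/srt/server.py in 0.3.3.post1 for the real API.
import sglang as sgl

# Drive the runtime in-process (no HTTP server), per the new Engine export.
engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
out = engine.generate(
    "Explain what RadixAttention caches.",
    {"temperature": 0.7, "max_new_tokens": 64},  # sampling params as a dict (assumed)
)
print(out)

# min_tokens is now part of the gen() signature; it requests a minimum number of
# generated tokens before stop conditions take effect.
@sgl.function
def describe(s, topic):
    s += "Describe " + topic + " in one paragraph.\n"
    s += sgl.gen("answer", max_tokens=128, min_tokens=16)
```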

sglang/bench_latency.py

@@ -47,6 +47,7 @@ I'm going to the park
  import argparse
  import dataclasses
  import itertools
+ import json
  import logging
  import multiprocessing
  import os
@@ -62,10 +63,11 @@ import torch.distributed as dist
  from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.hf_transformers_utils import get_tokenizer
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server import _set_envs_and_config
- from sglang.srt.server_args import ServerArgs
+ from sglang.srt.server_args import PortArgs, ServerArgs
  from sglang.srt.utils import (
  configure_logger,
  kill_child_process,
@@ -121,7 +123,7 @@ class BenchArgs:
  )


- def load_model(server_args, tp_rank):
+ def load_model(server_args, port_args, tp_rank):
  suppress_other_loggers()
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

@@ -129,6 +131,7 @@ def load_model(server_args, tp_rank):
  server_args.model_path,
  server_args.trust_remote_code,
  context_length=server_args.context_length,
+ model_override_args=json.loads(server_args.json_model_override_args),
  )
  model_runner = ModelRunner(
  model_config=model_config,
@@ -136,7 +139,7 @@ def load_model(server_args, tp_rank):
  gpu_id=tp_rank,
  tp_rank=tp_rank,
  tp_size=server_args.tp_size,
- nccl_port=28888,
+ nccl_port=port_args.nccl_port,
  server_args=server_args,
  )
  rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -167,9 +170,13 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
  assert len(input_ids[i]) > bench_args.cut_len

  tmp_input_ids = input_ids[i][: bench_args.cut_len]
- req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
+ req = Req(
+ rid=i,
+ origin_input_text=prompts[i],
+ origin_input_ids=tmp_input_ids,
+ sampling_params=sampling_params,
+ )
  req.prefix_indices = []
- req.sampling_params = sampling_params
  req.fill_ids = req.origin_input_ids
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
  reqs.append(req)
@@ -199,9 +206,13 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):

  reqs = []
  for i in range(len(input_ids)):
- req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
+ req = Req(
+ rid=i,
+ origin_input_text="",
+ origin_input_ids=list(input_ids[i]),
+ sampling_params=sampling_params,
+ )
  req.prefix_indices = []
- req.sampling_params = sampling_params
  req.fill_ids = req.origin_input_ids
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
  reqs.append(req)
@@ -209,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
  return reqs


+ @torch.inference_mode()
  def extend(reqs, model_runner):
  batch = ScheduleBatch.init_new(
  reqs=reqs,
@@ -217,28 +229,33 @@ def extend(reqs, model_runner):
  tree_cache=None,
  )
  batch.prepare_for_extend(model_runner.model_config.vocab_size)
- logits_output = model_runner.forward(batch)
- next_token_ids = model_runner.sample(logits_output, batch).tolist()
+ model_worker_batch = batch.get_model_worker_batch()
+ forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+ logits_output = model_runner.forward(forward_batch)
+ next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
  return next_token_ids, logits_output.next_token_logits, batch


+ @torch.inference_mode()
  def decode(input_token_ids, batch, model_runner):
  batch.prepare_for_decode(input_token_ids)
- logits_output = model_runner.forward(batch)
- next_token_ids = model_runner.sample(logits_output, batch).tolist()
+ model_worker_batch = batch.get_model_worker_batch()
+ forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+ logits_output = model_runner.forward(forward_batch)
+ next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
  return next_token_ids, logits_output.next_token_logits


- @torch.inference_mode()
  def correctness_test(
  server_args,
+ port_args,
  bench_args,
  tp_rank,
  ):
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

  # Load the model
- model_runner, tokenizer = load_model(server_args, tp_rank)
+ model_runner, tokenizer = load_model(server_args, port_args, tp_rank)

  # Prepare inputs
  input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
@@ -271,7 +288,6 @@ def correctness_test(
  rank_print(tokenizer.decode(output_ids[i]), "\n")


- @torch.inference_mode()
  def latency_test_run_once(
  run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
  ):
@@ -324,13 +340,16 @@ def latency_test_run_once(
  rank_print(
  f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
  )
- med_decode_latency = np.median(decode_latencies)
- med_decode_throughput = batch_size / med_decode_latency
- rank_print(
- f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
- )
- measurement_results["median_decode_latency"] = med_decode_latency
- measurement_results["median_decode_throughput"] = med_decode_throughput
+
+ # record decode timing from 2nd output
+ if output_len > 1:
+ med_decode_latency = np.median(decode_latencies)
+ med_decode_throughput = batch_size / med_decode_latency
+ rank_print(
+ f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
+ )
+ measurement_results["median_decode_latency"] = med_decode_latency
+ measurement_results["median_decode_throughput"] = med_decode_throughput

  throughput = (input_len + output_len) * batch_size / tot_latency
  rank_print(
@@ -343,15 +362,15 @@

  def latency_test(
  server_args,
+ port_args,
  bench_args,
  tp_rank,
  ):
  configure_logger(server_args, prefix=f" TP{tp_rank}")
- _set_envs_and_config(server_args)
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

  # Load the model
- model_runner, tokenizer = load_model(server_args, tp_rank)
+ model_runner, tokenizer = load_model(server_args, port_args, tp_rank)

  # Prepare inputs for warm up
  reqs = prepare_synthetic_inputs_for_latency_test(
@@ -367,7 +386,7 @@ def latency_test(
  reqs,
  bench_args.batch_size[0],
  bench_args.input_len[0],
- 4, # shorter decoding to speed up the warmup
+ 8, # shorter decoding to speed up the warmup
  )
  rank_print("Benchmark ...")

@@ -453,6 +472,7 @@


  def main(server_args, bench_args):
+ _set_envs_and_config(server_args)

  if server_args.model_path:
  if bench_args.correctness_test:
@@ -468,8 +488,10 @@ def main(server_args, bench_args):
  "provide --result-filename for plotting the results"
  )

+ port_args = PortArgs.init_new(server_args)
+
  if server_args.tp_size == 1:
- work_func(server_args, bench_args, 0)
+ work_func(server_args, port_args, bench_args, 0)
  else:
  workers = []
  for tp_rank in range(server_args.tp_size):
@@ -477,6 +499,7 @@ def main(server_args, bench_args):
  target=work_func,
  args=(
  server_args,
+ port_args,
  bench_args,
  tp_rank,
  ),
@@ -503,8 +526,6 @@ if __name__ == "__main__":
  format="%(message)s",
  )

- multiprocessing.set_start_method("spawn", force=True)
-
  try:
  main(server_args, bench_args)
  except Exception as e: