sglang 0.3.3__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. {sglang-0.3.3/sglang.egg-info → sglang-0.3.4}/PKG-INFO +82 -32
  2. {sglang-0.3.3 → sglang-0.3.4}/README.md +51 -14
  3. {sglang-0.3.3 → sglang-0.3.4}/pyproject.toml +12 -5
  4. {sglang-0.3.3 → sglang-0.3.4}/sglang/bench_latency.py +31 -13
  5. {sglang-0.3.3 → sglang-0.3.4}/sglang/bench_server_latency.py +21 -10
  6. {sglang-0.3.3 → sglang-0.3.4}/sglang/bench_serving.py +101 -7
  7. {sglang-0.3.3 → sglang-0.3.4}/sglang/global_config.py +0 -1
  8. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/conversation.py +11 -2
  9. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/attention/__init__.py +27 -5
  10. sglang-0.3.4/sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
  11. sglang-0.3.4/sglang/srt/layers/attention/flashinfer_backend.py +546 -0
  12. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/attention/triton_backend.py +6 -4
  13. sglang-0.3.4/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
  14. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
  15. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
  16. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/sampler.py +6 -2
  17. sglang-0.3.4/sglang/srt/managers/data_parallel_controller.py +177 -0
  18. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/detokenizer_manager.py +31 -10
  19. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/io_struct.py +11 -2
  20. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/schedule_batch.py +126 -43
  21. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/schedule_policy.py +2 -1
  22. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/scheduler.py +245 -142
  23. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/tokenizer_manager.py +14 -1
  24. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/tp_worker.py +111 -1
  25. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/mem_cache/chunk_cache.py +8 -4
  26. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/mem_cache/memory_pool.py +77 -4
  27. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/mem_cache/radix_cache.py +15 -7
  28. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/model_executor/cuda_graph_runner.py +4 -4
  29. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/model_executor/forward_batch_info.py +16 -21
  30. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/model_executor/model_runner.py +100 -36
  31. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/baichuan.py +2 -3
  32. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/chatglm.py +5 -6
  33. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/commandr.py +1 -2
  34. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/dbrx.py +1 -2
  35. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/deepseek.py +4 -5
  36. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/deepseek_v2.py +5 -6
  37. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/exaone.py +1 -2
  38. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/gemma.py +2 -2
  39. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/gemma2.py +5 -5
  40. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/gpt_bigcode.py +5 -5
  41. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/grok.py +1 -2
  42. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/internlm2.py +1 -2
  43. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/llama.py +1 -2
  44. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/llama_classification.py +1 -2
  45. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/llama_reward.py +2 -3
  46. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/llava.py +4 -8
  47. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/llavavid.py +1 -2
  48. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/minicpm.py +1 -2
  49. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/minicpm3.py +5 -6
  50. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/mixtral.py +1 -2
  51. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/mixtral_quant.py +1 -2
  52. sglang-0.3.4/sglang/srt/models/olmo.py +352 -0
  53. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/olmoe.py +1 -2
  54. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/qwen.py +1 -2
  55. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/qwen2.py +1 -2
  56. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/qwen2_moe.py +4 -5
  57. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/stablelm.py +1 -2
  58. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/torch_native_llama.py +1 -2
  59. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/xverse.py +1 -2
  60. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/xverse_moe.py +4 -5
  61. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/yivl.py +1 -2
  62. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/openai_api/adapter.py +97 -52
  63. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/openai_api/protocol.py +10 -2
  64. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
  65. sglang-0.3.4/sglang/srt/sampling/sampling_batch_info.py +226 -0
  66. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/sampling/sampling_params.py +2 -0
  67. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/server.py +171 -37
  68. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/server_args.py +127 -48
  69. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/utils.py +37 -14
  70. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/few_shot_gsm8k.py +4 -1
  71. sglang-0.3.4/sglang/test/few_shot_gsm8k_engine.py +144 -0
  72. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/srt/sampling/penaltylib/utils.py +16 -12
  73. sglang-0.3.4/sglang/version.py +1 -0
  74. {sglang-0.3.3 → sglang-0.3.4/sglang.egg-info}/PKG-INFO +82 -32
  75. {sglang-0.3.3 → sglang-0.3.4}/sglang.egg-info/SOURCES.txt +5 -1
  76. {sglang-0.3.3 → sglang-0.3.4}/sglang.egg-info/requires.txt +20 -3
  77. sglang-0.3.3/sglang/srt/layers/attention/flashinfer_backend.py +0 -277
  78. sglang-0.3.3/sglang/srt/layers/attention/flashinfer_utils.py +0 -237
  79. sglang-0.3.3/sglang/srt/sampling/sampling_batch_info.py +0 -180
  80. sglang-0.3.3/sglang/version.py +0 -1
  81. {sglang-0.3.3 → sglang-0.3.4}/LICENSE +0 -0
  82. {sglang-0.3.3 → sglang-0.3.4}/setup.cfg +0 -0
  83. {sglang-0.3.3 → sglang-0.3.4}/sglang/__init__.py +0 -0
  84. {sglang-0.3.3 → sglang-0.3.4}/sglang/api.py +0 -0
  85. {sglang-0.3.3 → sglang-0.3.4}/sglang/check_env.py +0 -0
  86. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/__init__.py +0 -0
  87. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/backend/__init__.py +0 -0
  88. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/backend/anthropic.py +0 -0
  89. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/backend/base_backend.py +0 -0
  90. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/backend/litellm.py +0 -0
  91. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/backend/openai.py +0 -0
  92. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/backend/runtime_endpoint.py +0 -0
  93. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/backend/vertexai.py +0 -0
  94. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/chat_template.py +0 -0
  95. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/choices.py +0 -0
  96. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/compiler.py +0 -0
  97. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/interpreter.py +0 -0
  98. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/ir.py +0 -0
  99. {sglang-0.3.3 → sglang-0.3.4}/sglang/lang/tracer.py +0 -0
  100. {sglang-0.3.3 → sglang-0.3.4}/sglang/launch_server.py +0 -0
  101. {sglang-0.3.3 → sglang-0.3.4}/sglang/launch_server_llavavid.py +0 -0
  102. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/configs/__init__.py +0 -0
  103. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/configs/exaone.py +0 -0
  104. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/configs/model_config.py +0 -0
  105. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/constrained/__init__.py +0 -0
  106. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/constrained/base_tool_cache.py +0 -0
  107. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/constrained/fsm_cache.py +0 -0
  108. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/constrained/jump_forward.py +0 -0
  109. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/hf_transformers_utils.py +0 -0
  110. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/activation.py +0 -0
  111. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  112. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  113. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  114. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/fused_moe/layer.py +0 -0
  115. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/fused_moe/patch.py +0 -0
  116. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/layernorm.py +0 -0
  117. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/linear.py +0 -0
  118. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/logits_processor.py +0 -0
  119. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/pooler.py +0 -0
  120. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/quantization/__init__.py +0 -0
  121. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/quantization/base_config.py +0 -0
  122. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/radix_attention.py +0 -0
  123. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/layers/torchao_utils.py +0 -0
  124. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/lora/lora.py +0 -0
  125. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/lora/lora_config.py +0 -0
  126. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/lora/lora_manager.py +0 -0
  127. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/managers/image_processor.py +0 -0
  128. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  129. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/mem_cache/flush_cache.py +0 -0
  130. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/mm_utils.py +0 -0
  131. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/llama_embedding.py +0 -0
  132. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/models/mistral.py +0 -0
  133. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  134. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  135. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  136. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  137. {sglang-0.3.3 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  138. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/run_eval.py +0 -0
  139. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/runners.py +0 -0
  140. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/simple_eval_common.py +0 -0
  141. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/simple_eval_gpqa.py +0 -0
  142. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/simple_eval_humaneval.py +0 -0
  143. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/simple_eval_math.py +0 -0
  144. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/simple_eval_mgsm.py +0 -0
  145. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/simple_eval_mmlu.py +0 -0
  146. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/test_activation.py +0 -0
  147. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/test_layernorm.py +0 -0
  148. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/test_programs.py +0 -0
  149. {sglang-0.3.3 → sglang-0.3.4}/sglang/test/test_utils.py +0 -0
  150. {sglang-0.3.3 → sglang-0.3.4}/sglang/utils.py +0 -0
  151. {sglang-0.3.3 → sglang-0.3.4}/sglang.egg-info/dependency_links.txt +0 -0
  152. {sglang-0.3.3 → sglang-0.3.4}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.3
+Version: 0.3.4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -214,26 +214,31 @@ License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
+Provides-Extra: runtime-common
+Requires-Dist: aiohttp; extra == "runtime-common"
+Requires-Dist: decord; extra == "runtime-common"
+Requires-Dist: fastapi; extra == "runtime-common"
+Requires-Dist: hf_transfer; extra == "runtime-common"
+Requires-Dist: huggingface_hub; extra == "runtime-common"
+Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: pillow; extra == "runtime-common"
+Requires-Dist: psutil; extra == "runtime-common"
+Requires-Dist: pydantic; extra == "runtime-common"
+Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: torchao; extra == "runtime-common"
+Requires-Dist: uvicorn; extra == "runtime-common"
+Requires-Dist: uvloop; extra == "runtime-common"
+Requires-Dist: zmq; extra == "runtime-common"
+Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
-Requires-Dist: aiohttp; extra == "srt"
-Requires-Dist: decord; extra == "srt"
-Requires-Dist: fastapi; extra == "srt"
-Requires-Dist: hf_transfer; extra == "srt"
-Requires-Dist: huggingface_hub; extra == "srt"
-Requires-Dist: interegular; extra == "srt"
-Requires-Dist: packaging; extra == "srt"
-Requires-Dist: pillow; extra == "srt"
-Requires-Dist: psutil; extra == "srt"
-Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: python-multipart; extra == "srt"
+Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: torchao; extra == "srt"
-Requires-Dist: uvicorn; extra == "srt"
-Requires-Dist: uvloop; extra == "srt"
-Requires-Dist: zmq; extra == "srt"
 Requires-Dist: vllm==0.5.5; extra == "srt"
-Requires-Dist: outlines>=0.0.44; extra == "srt"
-Requires-Dist: modelscope; extra == "srt"
+Provides-Extra: srt-xpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -253,12 +258,20 @@ Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Provides-Extra: all-xpu
+Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
+Requires-Dist: sglang[openai]; extra == "all-xpu"
+Requires-Dist: sglang[anthropic]; extra == "all-xpu"
+Requires-Dist: sglang[litellm]; extra == "all-xpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
+Provides-Extra: dev-xpu
+Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
+Requires-Dist: sglang[test]; extra == "dev-xpu"
 
-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -270,15 +283,13 @@ Requires-Dist: sglang[test]; extra == "dev"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
-
-## Upcoming Events
-- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
-- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## News
-- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
-- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
@@ -324,7 +335,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -501,6 +512,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 
+### Engine Without HTTP Server
+
+We also provide an inference engine **without a HTTP server**. For example,
+
+```python
+import sglang as sgl
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+This can be used for:
+
+1. **Offline Batch Inference**
+2. **Building Custom Servers**
+
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+
 ### Supported Models
 
 **Generative Models**
@@ -837,10 +882,7 @@ def chat_example(s):
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Benchmark And Performance
-![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
-![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
-
-Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
+Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -848,3 +890,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+<a href="#sglangtop" target="_blank">
+<bold>Back To Top </bold>
+</a>
+</p>
README.md

@@ -1,5 +1,5 @@
-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -11,15 +11,13 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
-
-## Upcoming Events
-- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
-- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## News
-- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
-- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
@@ -65,7 +63,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -242,6 +240,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 
+### Engine Without HTTP Server
+
+We also provide an inference engine **without a HTTP server**. For example,
+
+```python
+import sglang as sgl
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+This can be used for:
+
+1. **Offline Batch Inference**
+2. **Building Custom Servers**
+
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+
 ### Supported Models
 
 **Generative Models**
@@ -578,10 +610,7 @@ def chat_example(s):
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Benchmark And Performance
-![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
-![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
-
-Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
+Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -589,3 +618,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+<a href="#sglangtop" target="_blank">
+<bold>Back To Top </bold>
+</a>
+</p>
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.3"
+version = "0.3.4"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,16 +20,23 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-       "packaging", "pillow", "psutil", "pydantic", "python-multipart",
-       "torch", "torchao", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.5", "outlines>=0.0.44", "modelscope"]
+runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+                  "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
+                  "torchao", "uvicorn", "uvloop", "zmq",
+                  "outlines>=0.0.44", "modelscope"]
+# xpu is not enabled in public vllm and torch whl,
+# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
+srt = ["sglang[runtime_common]", "torch", "vllm==0.5.5"]
+srt_xpu = ["sglang[runtime_common]"]
+
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
 test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate", "peft"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
+dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
sglang/bench_latency.py

@@ -139,7 +139,7 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
-        nccl_port=port_args.nccl_ports[0],
+        nccl_port=port_args.nccl_port,
         server_args=server_args,
     )
     rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -220,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs
 
 
+@torch.inference_mode()
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -231,26 +232,28 @@ def extend(reqs, model_runner):
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits, batch
 
 
+@torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
-    batch.prepare_for_decode(input_token_ids)
+    batch.output_ids = input_token_ids
+    batch.prepare_for_decode()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
    return next_token_ids, logits_output.next_token_logits
 
 
-@torch.inference_mode()
 def correctness_test(
     server_args,
     port_args,
     bench_args,
     tp_rank,
 ):
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
     # Load the model
@@ -278,8 +281,9 @@ def correctness_test(
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
     for _ in range(bench_args.output_len[0] - 1):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        next_token_ids_list = next_token_ids.tolist()
         for i in range(len(reqs)):
-            output_ids[i].append(next_token_ids[i])
+            output_ids[i].append(next_token_ids_list[i])
 
     # Print
     for i in range(len(reqs)):
@@ -287,9 +291,15 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
-@torch.inference_mode()
+def synchronize(device):
+    if device == "cuda":
+        torch.cuda.synchronize()
+    elif device == "xpu":
+        torch.xpu.synchronize()
+
+
 def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
+    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -312,10 +322,10 @@
     tot_latency = 0
 
     # Prefill
-    torch.cuda.synchronize()
+    synchronize(device)
     tic = time.time()
     next_token_ids, _, batch = extend(reqs, model_runner)
-    torch.cuda.synchronize()
+    synchronize(device)
     prefill_latency = time.time() - tic
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
@@ -328,10 +338,10 @@
     # Decode
     decode_latencies = []
     for i in range(output_len - 1):
-        torch.cuda.synchronize()
+        synchronize(device)
         tic = time.time()
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-        torch.cuda.synchronize()
+        synchronize(device)
         latency = time.time() - tic
         tot_latency += latency
         throughput = batch_size / latency
@@ -387,6 +397,7 @@ def latency_test(
         bench_args.batch_size[0],
         bench_args.input_len[0],
         8,  # shorter decoding to speed up the warmup
+        server_args.device,
     )
     rank_print("Benchmark ...")
 
@@ -397,7 +408,14 @@
     ):
         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
-            bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
+            bench_args.run_name,
+            model_runner,
+            rank_print,
+            reqs,
+            bs,
+            il,
+            ol,
+            server_args.device,
         )
         if ret is not None:
             result_list.append(ret)
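
The bench_latency.py changes above replace the hard-coded `torch.cuda.synchronize()` calls with a `synchronize(device)` helper and pass `server_args.device` into `latency_test_run_once`, so the benchmark can also time non-CUDA runs. A hypothetical invocation under these changes (the `--device` flag is an assumption based on `server_args.device`; the corresponding server_args.py change is not shown in this hunk):

```bash
# Latency test on the device-agnostic path; cuda remains the usual default,
# while xpu exercises the torch.xpu.synchronize() branch added above
python3 -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
    --batch-size 1 --input-len 1024 --output-len 8 --device xpu
```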
sglang/bench_server_latency.py

@@ -6,6 +6,8 @@ It accepts arguments similar to those of launch_server.py.
 Usage:
 
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+
+python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
@@ -32,6 +34,8 @@ class BenchArgs:
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
     result_filename: str = "result.jsonl"
+    base_url: str = ""
+    skip_warmup: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -48,6 +52,8 @@ class BenchArgs:
         parser.add_argument(
             "--result-filename", type=str, default=BenchArgs.result_filename
         )
+        parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
+        parser.add_argument("--skip-warmup", action="store_true")
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -139,17 +145,21 @@ def run_one_case(
 
 
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
-    proc, base_url = launch_server_process(server_args)
+    if bench_args.base_url:
+        proc, base_url = None, bench_args.base_url
+    else:
+        proc, base_url = launch_server_process(server_args)
 
     # warmup
-    run_one_case(
-        base_url,
-        batch_size=16,
-        input_len=1024,
-        output_len=16,
-        run_name="",
-        result_filename="",
-    )
+    if not bench_args.skip_warmup:
+        run_one_case(
+            base_url,
+            batch_size=16,
+            input_len=1024,
+            output_len=16,
+            run_name="",
+            result_filename="",
+        )
 
     # benchmark
     try:
@@ -165,7 +175,8 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             bench_args.result_filename,
         )
     finally:
-        kill_child_process(proc.pid)
+        if proc:
+            kill_child_process(proc.pid)
 
     print(f"\nResults are saved to {bench_args.result_filename}")
 
sglang/bench_serving.py

@@ -222,6 +222,85 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_sglang_generate(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    prompt = request_func_input.prompt
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "text": prompt,
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": request_func_input.output_len,
+                "ignore_eos": not args.disable_ignore_eos,
+            },
+            "stream": not args.disable_stream,
+            **request_func_input.extra_request_body,
+        }
+        headers = {}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+                        # print(chunk_bytes)
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text = data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = request_func_input.output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 async def async_request_gserver(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -264,7 +343,9 @@ def get_tokenizer(
 
 
 ASYNC_REQUEST_FUNCS = {
-    "sglang": async_request_openai_completions,
+    "sglang": async_request_sglang_generate,
+    "sglang-native": async_request_sglang_generate,
+    "sglang-oai": async_request_openai_completions,
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
     "trt": async_request_trt_llm,
@@ -387,6 +468,8 @@ def sample_sharegpt_requests(
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))
 
+    print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
+    print(f"#Output tokens: {np.sum([x[2] for x in filtered_dataset])}")
     return filtered_dataset
 
 
@@ -587,6 +670,8 @@ async def benchmark(
     else:
         print("Initial test run completed. Starting main benchmark run...")
 
+    time.sleep(1.5)
+
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
@@ -782,24 +867,33 @@ def run_benchmark(args_: argparse.Namespace):
     if args.port is None:
         args.port = {
             "sglang": 30000,
+            "sglang-native": 30000,
+            "sglang-oai": 30000,
             "lmdeploy": 23333,
             "vllm": 8000,
             "trt": 8000,
             "gserver": 9988,
         }.get(args.backend, 30000)
 
-    api_url = (
-        f"{args.base_url}/v1/completions"
-        if args.base_url
-        else f"http://{args.host}:{args.port}/v1/completions"
-    )
     model_url = (
         f"{args.base_url}/v1/models"
         if args.base_url
         else f"http://{args.host}:{args.port}/v1/models"
     )
 
-    if args.backend == "trt":
+    if args.backend in ["sglang", "sglang-native"]:
+        api_url = (
+            f"{args.base_url}/generate"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/generate"
+        )
+    elif args.backend in ["sglang-oai", "vllm", "lmdeploy"]:
+        api_url = (
+            f"{args.base_url}/v1/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/completions"
+        )
+    elif args.backend == "trt":
         api_url = (
             f"{args.base_url}/v2/models/ensemble/generate_stream"
             if args.base_url
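
With these changes, the `sglang` and new `sglang-native` backends in bench_serving.py target the native `/generate` endpoint, while `sglang-oai` preserves the previous OpenAI-compatible `/v1/completions` behavior. A hypothetical comparison run under the new backend names (assuming the script's existing `--backend` and `--num-prompts` flags and a server on the default port 30000):

```bash
# Native /generate endpoint (now also what --backend sglang uses)
python3 -m sglang.bench_serving --backend sglang-native --num-prompts 1000

# OpenAI-compatible /v1/completions endpoint, matching the 0.3.3 behavior
python3 -m sglang.bench_serving --backend sglang-oai --num-prompts 1000
```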