sglang 0.3.5__tar.gz → 0.3.5.post2__tar.gz

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (168)
  1. {sglang-0.3.5 → sglang-0.3.5.post2}/PKG-INFO +12 -8
  2. {sglang-0.3.5 → sglang-0.3.5.post2}/README.md +8 -5
  3. {sglang-0.3.5 → sglang-0.3.5.post2}/pyproject.toml +5 -4
  4. sglang-0.3.5.post2/sglang/bench_offline_throughput.py +309 -0
  5. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/bench_serving.py +148 -24
  6. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/model_config.py +5 -2
  7. sglang-0.3.5.post2/sglang/srt/constrained/__init__.py +17 -0
  8. sglang-0.3.5.post2/sglang/srt/constrained/base_grammar_backend.py +73 -0
  9. sglang-0.3.5.post2/sglang/srt/constrained/outlines_backend.py +165 -0
  10. sglang-0.3.5.post2/sglang/srt/constrained/outlines_jump_forward.py +182 -0
  11. sglang-0.3.5.post2/sglang/srt/constrained/xgrammar_backend.py +150 -0
  12. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
  13. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
  14. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/fused_moe.py +23 -7
  15. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/patch.py +4 -2
  16. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/quantization/base_config.py +4 -6
  17. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/vocab_parallel_embedding.py +216 -150
  18. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/detokenizer_manager.py +0 -14
  19. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/io_struct.py +5 -3
  20. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/schedule_batch.py +14 -20
  21. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/scheduler.py +159 -96
  22. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/tokenizer_manager.py +81 -17
  23. sglang-0.3.5.post2/sglang/srt/metrics/collector.py +211 -0
  24. sglang-0.3.5.post2/sglang/srt/metrics/func_timer.py +108 -0
  25. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mm_utils.py +1 -1
  26. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  27. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/model_executor/forward_batch_info.py +7 -3
  28. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/model_executor/model_runner.py +6 -2
  29. sglang-0.3.5.post2/sglang/srt/models/gemma2_reward.py +69 -0
  30. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gpt2.py +31 -37
  31. sglang-0.3.5.post2/sglang/srt/models/internlm2_reward.py +62 -0
  32. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama.py +11 -6
  33. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama_reward.py +5 -26
  34. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/olmo.py +0 -0
  35. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen2_vl.py +5 -7
  36. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/openai_api/adapter.py +11 -4
  37. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/openai_api/protocol.py +29 -26
  38. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/sampling_batch_info.py +2 -3
  39. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/sampling_params.py +2 -16
  40. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/server.py +60 -17
  41. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/server_args.py +66 -25
  42. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/utils.py +120 -0
  43. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_common.py +1 -1
  44. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_humaneval.py +2 -2
  45. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_mgsm.py +2 -2
  46. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_utils.py +21 -7
  47. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/utils.py +1 -0
  48. sglang-0.3.5.post2/sglang/version.py +1 -0
  49. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/PKG-INFO +12 -8
  50. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/SOURCES.txt +9 -5
  51. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/requires.txt +3 -2
  52. sglang-0.3.5/sglang/srt/constrained/__init__.py +0 -81
  53. sglang-0.3.5/sglang/srt/constrained/base_tool_cache.py +0 -65
  54. sglang-0.3.5/sglang/srt/constrained/bnf_cache.py +0 -61
  55. sglang-0.3.5/sglang/srt/constrained/fsm_cache.py +0 -95
  56. sglang-0.3.5/sglang/srt/constrained/grammar.py +0 -190
  57. sglang-0.3.5/sglang/srt/constrained/jump_forward.py +0 -203
  58. sglang-0.3.5/sglang/version.py +0 -1
  59. {sglang-0.3.5 → sglang-0.3.5.post2}/LICENSE +0 -0
  60. {sglang-0.3.5 → sglang-0.3.5.post2}/setup.cfg +0 -0
  61. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/__init__.py +0 -0
  62. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/api.py +0 -0
  63. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/bench_latency.py +0 -0
  64. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/bench_server_latency.py +0 -0
  65. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/check_env.py +0 -0
  66. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/global_config.py +0 -0
  67. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/__init__.py +0 -0
  68. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/__init__.py +0 -0
  69. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/anthropic.py +0 -0
  70. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/base_backend.py +0 -0
  71. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/litellm.py +0 -0
  72. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/openai.py +0 -0
  73. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  74. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/vertexai.py +0 -0
  75. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/chat_template.py +0 -0
  76. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/choices.py +0 -0
  77. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/compiler.py +0 -0
  78. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/interpreter.py +0 -0
  79. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/ir.py +0 -0
  80. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/tracer.py +0 -0
  81. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/launch_server.py +0 -0
  82. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/launch_server_llavavid.py +0 -0
  83. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/__init__.py +0 -0
  84. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/exaone.py +0 -0
  85. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/qwen2vl.py +0 -0
  86. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/conversation.py +0 -0
  87. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/hf_transformers_utils.py +0 -0
  88. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/activation.py +0 -0
  89. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/__init__.py +0 -0
  90. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  91. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  92. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
  93. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  94. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  95. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  96. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
  97. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/layernorm.py +0 -0
  98. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/linear.py +0 -0
  99. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/logits_processor.py +0 -0
  100. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/pooler.py +0 -0
  101. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
  102. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/radix_attention.py +0 -0
  103. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
  104. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/sampler.py +0 -0
  105. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  106. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/lora/lora.py +0 -0
  107. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/lora/lora_config.py +0 -0
  108. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/lora/lora_manager.py +0 -0
  109. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
  110. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/image_processor.py +0 -0
  111. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  112. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/tp_worker.py +0 -0
  113. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  114. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  115. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  116. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  117. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
  118. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  119. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/baichuan.py +0 -0
  120. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/chatglm.py +0 -0
  121. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/commandr.py +0 -0
  122. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/dbrx.py +0 -0
  123. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/deepseek.py +0 -0
  124. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/deepseek_v2.py +0 -0
  125. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/exaone.py +0 -0
  126. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gemma.py +0 -0
  127. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gemma2.py +0 -0
  128. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  129. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/grok.py +0 -0
  130. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/internlm2.py +0 -0
  131. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama_classification.py +0 -0
  132. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama_embedding.py +0 -0
  133. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llava.py +0 -0
  134. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llavavid.py +0 -0
  135. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/minicpm.py +0 -0
  136. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/minicpm3.py +0 -0
  137. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mistral.py +0 -0
  138. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mixtral.py +0 -0
  139. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mixtral_quant.py +0 -0
  140. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mllama.py +0 -0
  141. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/olmoe.py +0 -0
  142. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen.py +0 -0
  143. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen2.py +0 -0
  144. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen2_moe.py +0 -0
  145. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/stablelm.py +0 -0
  146. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  147. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/xverse.py +0 -0
  148. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/xverse_moe.py +0 -0
  149. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/yivl.py +0 -0
  150. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  151. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  152. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  153. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  154. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  155. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  156. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  157. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  158. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/run_eval.py +0 -0
  159. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/runners.py +0 -0
  160. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  161. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_math.py +0 -0
  162. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  163. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  164. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_activation.py +0 -0
  165. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_layernorm.py +0 -0
  166. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_programs.py +0 -0
  167. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/dependency_links.txt +0 -0
  168. {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.5 → sglang-0.3.5.post2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.5
+ Version: 0.3.5.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -225,14 +225,15 @@ Requires-Dist: interegular; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
  Requires-Dist: pillow; extra == "runtime-common"
+ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
  Requires-Dist: psutil; extra == "runtime-common"
  Requires-Dist: pydantic; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: torchao; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: zmq; extra == "runtime-common"
- Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
+ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
@@ -291,13 +292,14 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
  [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
  [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
  [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+ [![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)

  </div>

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+ [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -321,11 +323,13 @@ The core features include:

  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

- ## Install
- See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+ ## Getting Started
+ Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+ Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)

  ## Backend: SGLang Runtime (SRT)
  See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
{sglang-0.3.5 → sglang-0.3.5.post2}/README.md
@@ -6,13 +6,14 @@
  [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
  [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
  [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+ [![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)

  </div>

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+ [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -36,11 +37,13 @@ The core features include:

  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

- ## Install
- See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+ ## Getting Started
+ Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+ Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)

  ## Backend: SGLang Runtime (SRT)
  See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
{sglang-0.3.5 → sglang-0.3.5.post2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.3.5"
+ version = "0.3.5.post2"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -17,10 +17,11 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]

  [project.optional-dependencies]
  runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-     "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
-     "torchao", "uvicorn", "uvloop", "zmq",
-     "outlines>=0.0.44", "modelscope"]
+     "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
+     "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
+     "outlines>=0.0.44,<0.1.0", "modelscope"]
  srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
  srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
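The dependency hunks above replace the `zmq` requirement with an explicit `pyzmq>=25.1.2` pin, cap `outlines` below 0.1.0, and add `prometheus-client` for the new `sglang/srt/metrics` modules. A minimal sketch for sanity-checking an upgraded environment; the expected ranges in the comments simply restate the pins from the diff, nothing else is assumed:

```python
# Sanity-check sketch for an environment upgraded to sglang 0.3.5.post2.
# The expected version ranges restate the pins from the diff above.
from importlib.metadata import version

print("pyzmq:", version("pyzmq"))                          # diff requires >= 25.1.2
print("outlines:", version("outlines"))                    # diff requires >= 0.0.44, < 0.1.0
print("prometheus-client:", version("prometheus-client"))  # diff requires >= 0.20.0 (new)
```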
sglang-0.3.5.post2/sglang/bench_offline_throughput.py
@@ -0,0 +1,309 @@
+ """
+ Benchmark the throughput of using the offline LLM engine.
+ This script does not launch a server.
+ It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
+
+ # Usage
+ ## Sharegpt dataset with default args
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+
+ ## Random dataset with default args
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
+
+ ## Shared prefix dataset with default args
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
+
+ ## Sharegpt dataset on runtime backend
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+ """
+
+ import argparse
+ import dataclasses
+ import json
+ import logging
+ import random
+ import time
+ from typing import List, Optional, Tuple
+
+ import numpy as np
+
+ from sglang.api import Engine
+ from sglang.bench_serving import (
+     get_dataset,
+     get_tokenizer,
+     sample_random_requests,
+     set_ulimit,
+ )
+ from sglang.srt.server import Runtime
+ from sglang.srt.server_args import ServerArgs
+
+
+ @dataclasses.dataclass
+ class BenchArgs:
+     backend: str = "engine"
+     result_filename: str = ""
+     dataset_name: str = "sharegpt"
+     dataset_path: str = ""
+     num_prompts: int = 1000
+     sharegpt_output_len: Optional[int] = None
+     random_input_len: int = 1024
+     random_output_len: int = 1024
+     random_range_ratio: float = 0.0
+     gen_num_groups: int = 64
+     gen_prompts_per_group: int = 16
+     gen_system_prompt_len: int = 2048
+     gen_question_len: int = 128
+     gen_output_len: int = 256
+     disable_ignore_eos: bool = False
+     seed: int = 1
+
+     @staticmethod
+     def add_cli_args(parser: argparse.ArgumentParser):
+         parser.add_argument("--backend", type=str, default=BenchArgs.backend)
+         parser.add_argument(
+             "--result-filename", type=str, default=BenchArgs.result_filename
+         )
+         parser.add_argument(
+             "--dataset-name",
+             type=str,
+             default="sharegpt",
+             choices=["sharegpt", "random", "generated-shared-prefix"],
+             help="Name of the dataset to benchmark on.",
+         )
+         parser.add_argument(
+             "--dataset-path", type=str, default="", help="Path to the dataset."
+         )
+         parser.add_argument(
+             "--num-prompts",
+             type=int,
+             default=BenchArgs.num_prompts,
+             help="Number of prompts to process. Default is 1000.",
+         )
+         parser.add_argument(
+             "--sharegpt-output-len",
+             type=int,
+             default=BenchArgs.sharegpt_output_len,
+             help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+         )
+         parser.add_argument(
+             "--random-input-len",
+             type=int,
+             default=BenchArgs.random_input_len,
+             help="Number of input tokens per request, used only for random dataset.",
+         )
+         parser.add_argument(
+             "--random-output-len",
+             type=int,
+             default=BenchArgs.random_output_len,
+             help="Number of output tokens per request, used only for random dataset.",
+         )
+         parser.add_argument(
+             "--random-range-ratio",
+             type=float,
+             default=BenchArgs.random_range_ratio,
+             help="Range of sampled ratio of input/output length, "
+             "used only for random dataset.",
+         )
+         parser.add_argument(
+             "--gen-num-groups",
+             type=int,
+             default=BenchArgs.gen_num_groups,
+             help="Number of groups with shared prefix, used"
+             "only for generate-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-prompts-per-group",
+             type=int,
+             default=BenchArgs.gen_prompts_per_group,
+             help="Number of prompts per group of shared prefix, used"
+             "only for generate-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-system-prompt-len",
+             type=int,
+             default=BenchArgs.gen_system_prompt_len,
+             help="System prompt length, used" "only for generate-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-question-len",
+             type=int,
+             default=BenchArgs.gen_question_len,
+             help="Question length, used" "only for generate-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-output-len",
+             type=int,
+             default=BenchArgs.gen_output_len,
+             help="Target length in tokens for outputs in generated-shared-prefix dataset",
+         )
+         parser.add_argument(
+             "--disable-ignore-eos",
+             type=bool,
+             default=BenchArgs.disable_ignore_eos,
+             help="Disable ignore EOS token",
+         )
+         parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+
+     @classmethod
+     def from_cli_args(cls, args: argparse.Namespace):
+         attrs = [attr.name for attr in dataclasses.fields(cls)]
+         return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+ def throughput_test_once(
+     backend_name: str,
+     backend,
+     reqs: List[Tuple[str, int, int]],
+     ignore_eos: bool,
+ ):
+     measurement_results = {
+         "backend": backend_name,
+         "successful_requests": len(reqs),
+         "total_latency": -1,
+         "total_input_tokens": sum(r[1] for r in reqs),
+         "total_output_tokens": -1,
+         "request_throughput": -1,
+         "input_throughput": -1,
+         "output_throughput": -1,
+         "total_throughput": -1,
+     }
+
+     prompt = [r[0] for r in reqs]
+     sampling_params = [
+         {
+             "temperature": 0,
+             "max_new_tokens": r[2],
+             "ignore_eos": ignore_eos,
+         }
+         for r in reqs
+     ]
+
+     st = time.perf_counter()
+     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+     latency = time.perf_counter() - st
+
+     if backend_name == "runtime":
+         gen_out = json.loads(gen_out)
+
+     measurement_results["total_latency"] = latency
+     measurement_results["total_output_tokens"] = sum(
+         o["meta_info"]["completion_tokens"] for o in gen_out
+     )
+     measurement_results["request_throughput"] = (
+         measurement_results["successful_requests"] / latency
+     )
+     measurement_results["input_throughput"] = (
+         measurement_results["total_input_tokens"] / latency
+     )
+     measurement_results["output_throughput"] = (
+         measurement_results["total_output_tokens"] / latency
+     )
+     measurement_results["total_throughput"] = (
+         measurement_results["total_input_tokens"]
+         + measurement_results["total_output_tokens"]
+     ) / latency
+
+     return measurement_results
+
+
+ def throughput_test(
+     server_args: ServerArgs,
+     bench_args: BenchArgs,
+ ):
+     if bench_args.backend == "engine":
+         backend = Engine(**dataclasses.asdict(server_args))
+         if not backend:
+             raise ValueError("Please provide valid engine arguments")
+     elif bench_args.backend == "runtime":
+         backend = Runtime(**dataclasses.asdict(server_args))
+     else:
+         raise ValueError('Please set backend to either "engine" or "runtime"')
+
+     tokenizer_id = server_args.model_path
+     tokenizer = get_tokenizer(tokenizer_id)
+
+     # Set global environmnets
+     set_ulimit()
+     random.seed(bench_args.seed)
+     np.random.seed(bench_args.seed)
+
+     # Read dataset
+     input_requests = get_dataset(bench_args, tokenizer)
+
+     warmup_requests = sample_random_requests(
+         input_len=20,
+         output_len=4,
+         num_prompts=2,
+         range_ratio=0.8,
+         tokenizer=tokenizer,
+         dataset_path=bench_args.dataset_path,
+     )
+
+     # Warm up
+     throughput_test_once(
+         backend_name=bench_args.backend,
+         backend=backend,
+         reqs=warmup_requests,
+         ignore_eos=not bench_args.disable_ignore_eos,
+     )
+
+     result = throughput_test_once(
+         backend_name=bench_args.backend,
+         backend=backend,
+         reqs=input_requests,
+         ignore_eos=not bench_args.disable_ignore_eos,
+     )
+
+     if bench_args.result_filename:
+         with open(bench_args.result_filename, "a") as fout:
+             fout.write(json.dumps(result) + "\n")
+
+     print(
+         "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
+     )
+     print("{:<40} {:<10}".format("Backend:", result["backend"]))
+     print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
+     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
+     print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
+     print(
+         "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Request throughput (req/s):", result["request_throughput"]
+         )
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Input token throughput (tok/s):", result["input_throughput"]
+         )
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Output token throughput (tok/s):", result["output_throughput"]
+         )
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Total token throughput (tok/s):", result["total_throughput"]
+         )
+     )
+     print("=" * 50)
+
+     return result
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     ServerArgs.add_cli_args(parser)
+     BenchArgs.add_cli_args(parser)
+     args = parser.parse_args()
+     server_args = ServerArgs.from_cli_args(args)
+     bench_args = BenchArgs.from_cli_args(args)
+
+     logging.basicConfig(
+         level=getattr(logging, server_args.log_level.upper()),
+         format="%(message)s",
+     )
+
+     throughput_test(server_args, bench_args)
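The docstring covers the CLI entry point; `throughput_test` can also be driven in-process. A minimal sketch, with field names mirroring `BenchArgs` and `ServerArgs` as used in the new file; the model path is only an example and must point to a model you can actually load:

```python
# Illustrative programmatic run of the new offline benchmark; the model
# path is an example placeholder, everything else mirrors the file above.
from sglang.bench_offline_throughput import BenchArgs, throughput_test
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
bench_args = BenchArgs(backend="engine", dataset_name="random", num_prompts=100)

result = throughput_test(server_args, bench_args)
print(f"{result['output_throughput']:.2f} tok/s")  # keys come from measurement_results
```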
{sglang-0.3.5 → sglang-0.3.5.post2}/sglang/bench_serving.py
@@ -421,6 +421,37 @@ def get_tokenizer(
  )


+ def get_dataset(args, tokenizer):
+     if args.dataset_name == "sharegpt":
+         input_requests = sample_sharegpt_requests(
+             dataset_path=args.dataset_path,
+             num_requests=args.num_prompts,
+             tokenizer=tokenizer,
+             fixed_output_len=args.sharegpt_output_len,
+         )
+     elif args.dataset_name == "random":
+         input_requests = sample_random_requests(
+             input_len=args.random_input_len,
+             output_len=args.random_output_len,
+             num_prompts=args.num_prompts,
+             range_ratio=args.random_range_ratio,
+             tokenizer=tokenizer,
+             dataset_path=args.dataset_path,
+         )
+     elif args.dataset_name == "generated-shared-prefix":
+         input_requests = sample_generated_shared_prefix_requests(
+             num_groups=args.gen_num_groups,
+             prompts_per_group=args.gen_prompts_per_group,
+             system_prompt_len=args.gen_system_prompt_len,
+             question_len=args.gen_question_len,
+             output_len=args.gen_output_len,
+             tokenizer=tokenizer,
+         )
+     else:
+         raise ValueError(f"Unknown dataset: {args.dataset_name}")
+     return input_requests
+
+
  ASYNC_REQUEST_FUNCS = {
      "sglang": async_request_sglang_generate,
      "sglang-native": async_request_sglang_generate,
@@ -443,6 +474,8 @@ class BenchmarkMetrics:
      input_throughput: float
      output_throughput: float
      output_throughput_retokenized: float
+     total_throughput: float
+     total_throughput_retokenized: float
      mean_ttft_ms: float
      median_ttft_ms: float
      std_ttft_ms: float
@@ -590,18 +623,25 @@ def sample_random_requests(
          (data["conversations"][0]["value"], data["conversations"][1]["value"])
          for data in dataset
      ]
-
      # Shuffle the dataset.
      random.shuffle(dataset)

      # Filter out sequences that are too long or too short
      input_requests: List[Tuple[str, int, int]] = []
-     for i in range(num_prompts):
+     for data in dataset:
+         i = len(input_requests)
+         if i == num_prompts:
+             break
+
          # Tokenize the prompts and completions.
-         prompt = dataset[i][0]
+         prompt = data[0]
          prompt_token_ids = tokenizer.encode(prompt)
          prompt_len = len(prompt_token_ids)

+         # Skip empty prompt
+         if prompt_len == 0:
+             continue
+
          if prompt_len > input_lens[i]:
              input_ids = prompt_token_ids[: input_lens[i]]
          else:
@@ -627,6 +667,66 @@
      return input_requests


+ def gen_prompt(tokenizer, token_num):
+     """Generate a random prompt of specified token length using tokenizer vocabulary."""
+     all_available_tokens = list(tokenizer.get_vocab().values())
+     selected_tokens = random.choices(all_available_tokens, k=token_num)
+     return tokenizer.decode(selected_tokens)
+
+
+ def sample_generated_shared_prefix_requests(
+     num_groups: int,
+     prompts_per_group: int,
+     system_prompt_len: int,
+     question_len: int,
+     output_len: int,
+     tokenizer: PreTrainedTokenizerBase,
+ ) -> List[Tuple[str, int, int]]:
+     """Generate benchmark requests with shared system prompts using random tokens."""
+     # Generate system prompts for each group
+     system_prompts = []
+     for _ in range(num_groups):
+         system_prompt = gen_prompt(tokenizer, system_prompt_len)
+         system_prompts.append(system_prompt)
+
+     # Generate questions
+     questions = []
+     for _ in range(num_groups * prompts_per_group):
+         question = gen_prompt(tokenizer, question_len)
+         questions.append(question)
+
+     # Combine system prompts with questions
+     input_requests = []
+     total_input_tokens = 0
+     total_output_tokens = 0
+
+     for group_idx in range(num_groups):
+         system_prompt = system_prompts[group_idx]
+         for prompt_idx in range(prompts_per_group):
+             question = questions[group_idx * prompts_per_group + prompt_idx]
+             full_prompt = f"{system_prompt}\n\n{question}"
+             prompt_len = len(tokenizer.encode(full_prompt))
+
+             input_requests.append((full_prompt, prompt_len, output_len))
+             total_input_tokens += prompt_len
+             total_output_tokens += output_len
+
+     print(f"\nGenerated shared prefix dataset statistics:")
+     print(f"Number of groups: {num_groups}")
+     print(f"Prompts per group: {prompts_per_group}")
+     print(f"Total prompts: {len(input_requests)}")
+     print(f"Total input tokens: {total_input_tokens}")
+     print(f"Total output tokens: {total_output_tokens}")
+     print(
+         f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
+     )
+     print(
+         f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
+     )
+
+     return input_requests
+
+
  async def get_request(
      input_requests: List[Tuple[str, int, int]],
      request_rate: float,
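The new shared-prefix sampler can be exercised standalone to see the shape of what it produces. A small sketch with deliberately tiny sizes so the random-token prompts stay cheap to build; the tokenizer name is again only an example:

```python
# Tiny smoke test of sample_generated_shared_prefix_requests; the sizes
# are minimal and the tokenizer name is an example placeholder.
from sglang.bench_serving import (
    get_tokenizer,
    sample_generated_shared_prefix_requests,
)

tokenizer = get_tokenizer("meta-llama/Meta-Llama-3.1-8B-Instruct")
reqs = sample_generated_shared_prefix_requests(
    num_groups=2,
    prompts_per_group=2,
    system_prompt_len=32,
    question_len=8,
    output_len=16,
    tokenizer=tokenizer,
)
assert len(reqs) == 2 * 2  # one (prompt, prompt_len, output_len) tuple per question
```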
@@ -696,6 +796,9 @@ def calculate_metrics(
          input_throughput=total_input / dur_s,
          output_throughput=sum(output_lens) / dur_s,
          output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+         total_throughput=(total_input + sum(output_lens)) / dur_s,
+         total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+         / dur_s,
          mean_ttft_ms=np.mean(ttfts or 0)
          * 1000,  # ttfts is empty if streaming is not supported by backend
          median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -813,6 +916,11 @@ async def benchmark(
              "Output token throughput (tok/s):", metrics.output_throughput
          )
      )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Total token throughput (tok/s):", metrics.total_throughput
+         )
+     )
      print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
      print(
          "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1030,26 +1138,7 @@ def run_benchmark(args_: argparse.Namespace):

      tokenizer = get_tokenizer(tokenizer_id)

-     if args.dataset_name == "sharegpt":
-         assert args.random_input_len is None and args.random_output_len is None
-         input_requests = sample_sharegpt_requests(
-             dataset_path=args.dataset_path,
-             num_requests=args.num_prompts,
-             tokenizer=tokenizer,
-             fixed_output_len=args.sharegpt_output_len,
-         )
-     elif args.dataset_name == "random":
-         assert args.random_input_len is not None and args.random_output_len is not None
-         input_requests = sample_random_requests(
-             input_len=args.random_input_len,
-             output_len=args.random_output_len,
-             num_prompts=args.num_prompts,
-             range_ratio=args.random_range_ratio,
-             tokenizer=tokenizer,
-             dataset_path=args.dataset_path,
-         )
-     else:
-         raise ValueError(f"Unknown dataset: {args.dataset_name}")
+     input_requests = get_dataset(args, tokenizer)

      if not args.multi:
          return asyncio.run(
@@ -1121,7 +1210,7 @@ if __name__ == "__main__":
          "--dataset-name",
          type=str,
          default="sharegpt",
-         choices=["sharegpt", "random"],
+         choices=["sharegpt", "random", "generated-shared-prefix"],
          help="Name of the dataset to benchmark on.",
      )
      parser.add_argument(
@@ -1152,10 +1241,12 @@
      parser.add_argument(
          "--random-input-len",
          type=int,
+         default=1024,
          help="Number of input tokens per request, used only for random dataset.",
      )
      parser.add_argument(
          "--random-output-len",
+         default=1024,
          type=int,
          help="Number of output tokens per request, used only for random dataset.",
      )
@@ -1208,5 +1299,38 @@
          help="Append given JSON object to the request payload. You can use this to specify"
          "additional generate params like sampling params.",
      )
+
+     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
+     group.add_argument(
+         "--gen-num-groups",
+         type=int,
+         default=64,
+         help="Number of system prompt groups for generated-shared-prefix dataset",
+     )
+     group.add_argument(
+         "--gen-prompts-per-group",
+         type=int,
+         default=16,
+         help="Number of prompts per system prompt group for generated-shared-prefix dataset",
+     )
+     group.add_argument(
+         "--gen-system-prompt-len",
+         type=int,
+         default=2048,
+         help="Target length in tokens for system prompts in generated-shared-prefix dataset",
+     )
+     group.add_argument(
+         "--gen-question-len",
+         type=int,
+         default=128,
+         help="Target length in tokens for questions in generated-shared-prefix dataset",
+     )
+     group.add_argument(
+         "--gen-output-len",
+         type=int,
+         default=256,
+         help="Target length in tokens for outputs in generated-shared-prefix dataset",
+     )
+
      args = parser.parse_args()
      run_benchmark(args)