sglang 0.3.6__tar.gz → 0.3.6.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. {sglang-0.3.6 → sglang-0.3.6.post2}/LICENSE +1 -1
  2. {sglang-0.3.6 → sglang-0.3.6.post2}/PKG-INFO +25 -15
  3. {sglang-0.3.6 → sglang-0.3.6.post2}/README.md +10 -12
  4. {sglang-0.3.6 → sglang-0.3.6.post2}/pyproject.toml +9 -4
  5. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/__init__.py +2 -2
  6. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/api.py +2 -2
  7. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch.py +4 -7
  8. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch_server.py +2 -2
  9. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py +75 -26
  10. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/check_env.py +7 -1
  11. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/base_backend.py +1 -1
  12. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/runtime_endpoint.py +2 -2
  13. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/tracer.py +1 -1
  14. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server.py +0 -3
  15. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py +15 -20
  16. sglang-0.3.6.post2/sglang/srt/constrained/__init__.py +16 -0
  17. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/base_grammar_backend.py +13 -15
  18. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/outlines_backend.py +13 -15
  19. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/outlines_jump_forward.py +13 -15
  20. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/xgrammar_backend.py +38 -57
  21. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/conversation.py +13 -15
  22. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/hf_transformers_utils.py +13 -15
  23. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/activation.py +13 -13
  24. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/flashinfer_backend.py +14 -7
  25. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  26. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  27. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  28. sglang-0.3.6.post2/sglang/srt/layers/custom_op_util.py +25 -0
  29. sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  30. {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok}/layer.py +4 -9
  31. sglang-0.3.6/sglang/srt/layers/fused_moe/patch.py → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_patch.py +5 -0
  32. sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  33. sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  34. sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  35. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/layernorm.py +13 -15
  36. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/logits_processor.py +13 -15
  37. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/quantization/__init__.py +77 -17
  38. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/radix_attention.py +13 -15
  39. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/rotary_embedding.py +13 -13
  40. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/sampler.py +1 -1
  41. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora.py +13 -14
  42. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora_config.py +13 -14
  43. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora_manager.py +22 -24
  44. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/data_parallel_controller.py +25 -19
  45. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/detokenizer_manager.py +13 -18
  46. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/image_processor.py +6 -9
  47. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/io_struct.py +43 -28
  48. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/schedule_batch.py +92 -27
  49. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/schedule_policy.py +13 -15
  50. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/scheduler.py +94 -72
  51. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/session_controller.py +29 -19
  52. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tokenizer_manager.py +29 -22
  53. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tp_worker.py +13 -15
  54. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
  55. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/metrics/collector.py +13 -15
  56. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/metrics/func_timer.py +13 -15
  57. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mm_utils.py +13 -14
  58. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/cuda_graph_runner.py +20 -19
  59. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/forward_batch_info.py +19 -17
  60. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/model_runner.py +42 -30
  61. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/chatglm.py +15 -16
  62. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/commandr.py +15 -16
  63. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/dbrx.py +15 -16
  64. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/deepseek.py +15 -15
  65. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/deepseek_v2.py +15 -15
  66. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/exaone.py +14 -15
  67. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma.py +14 -14
  68. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma2.py +24 -19
  69. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma2_reward.py +13 -14
  70. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gpt_bigcode.py +14 -14
  71. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/grok.py +15 -15
  72. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/internlm2.py +13 -15
  73. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/internlm2_reward.py +13 -14
  74. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama.py +21 -21
  75. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_classification.py +13 -14
  76. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_reward.py +13 -14
  77. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llava.py +20 -16
  78. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llavavid.py +13 -15
  79. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/minicpm.py +13 -15
  80. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/minicpm3.py +13 -15
  81. sglang-0.3.6.post2/sglang/srt/models/mistral.py +23 -0
  82. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mixtral.py +15 -15
  83. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mixtral_quant.py +14 -14
  84. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/olmo.py +21 -19
  85. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/olmoe.py +23 -20
  86. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen.py +14 -14
  87. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2.py +22 -19
  88. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2_moe.py +17 -18
  89. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/stablelm.py +18 -16
  90. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/torch_native_llama.py +15 -17
  91. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/xverse.py +13 -14
  92. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/xverse_moe.py +15 -16
  93. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/yivl.py +13 -15
  94. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/openai_api/adapter.py +13 -15
  95. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/openai_api/protocol.py +13 -15
  96. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/sampling_batch_info.py +4 -1
  97. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/sampling_params.py +13 -15
  98. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/server.py +60 -34
  99. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/server_args.py +22 -22
  100. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/utils.py +208 -19
  101. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/few_shot_gsm8k.py +8 -4
  102. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/runners.py +13 -14
  103. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_utils.py +2 -2
  104. sglang-0.3.6.post2/sglang/version.py +1 -0
  105. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/PKG-INFO +25 -15
  106. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/SOURCES.txt +7 -4
  107. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/requires.txt +16 -1
  108. sglang-0.3.6/sglang/srt/constrained/__init__.py +0 -17
  109. sglang-0.3.6/sglang/srt/layers/custom_op_util.py +0 -26
  110. sglang-0.3.6/sglang/srt/layers/fused_moe/__init__.py +0 -1
  111. sglang-0.3.6/sglang/srt/models/mistral.py +0 -25
  112. sglang-0.3.6/sglang/version.py +0 -1
  113. {sglang-0.3.6 → sglang-0.3.6.post2}/setup.cfg +0 -0
  114. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_latency.py +0 -0
  115. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_offline_throughput.py +0 -0
  116. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/global_config.py +0 -0
  117. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/__init__.py +0 -0
  118. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/__init__.py +0 -0
  119. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/anthropic.py +0 -0
  120. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/litellm.py +0 -0
  121. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/openai.py +0 -0
  122. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/vertexai.py +0 -0
  123. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/chat_template.py +0 -0
  124. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/choices.py +0 -0
  125. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/compiler.py +0 -0
  126. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/interpreter.py +0 -0
  127. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/ir.py +0 -0
  128. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server_llavavid.py +0 -0
  129. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/__init__.py +0 -0
  130. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/exaone.py +0 -0
  131. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/qwen2vl.py +0 -0
  132. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/__init__.py +0 -0
  133. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  134. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
  135. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  136. {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok}/fused_moe.py +0 -0
  137. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/linear.py +0 -0
  138. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/pooler.py +0 -0
  139. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  140. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  141. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  142. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  143. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  144. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  145. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
  146. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  147. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_parallel.py +0 -0
  148. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/baichuan.py +0 -0
  149. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gpt2.py +0 -0
  150. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_embedding.py +0 -0
  151. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mllama.py +0 -0
  152. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/phi3_small.py +0 -0
  153. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2_vl.py +0 -0
  154. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  155. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  156. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  157. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  158. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  159. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  160. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  161. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/run_eval.py +0 -0
  162. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_common.py +0 -0
  163. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  164. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  165. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_math.py +0 -0
  166. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  167. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  168. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  169. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_activation.py +0 -0
  170. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_layernorm.py +0 -0
  171. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_programs.py +0 -0
  172. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/utils.py +0 -0
  173. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/dependency_links.txt +0 -0
  174. {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.3.6 → sglang-0.3.6.post2}/LICENSE
@@ -186,7 +186,7 @@
  same "printed page" as the copyright notice for easier
  identification within third-party archives.
 
- Copyright [yyyy] [name of copyright owner]
+ Copyright 2023-2024 SGLang Team
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.

{sglang-0.3.6 → sglang-0.3.6.post2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.6
+ Version: 0.3.6.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
  same "printed page" as the copyright notice for easier
  identification within third-party archives.
 
- Copyright [yyyy] [name of copyright owner]
+ Copyright 2023-2024 SGLang Team
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -222,6 +222,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
  Requires-Dist: hf_transfer; extra == "runtime-common"
  Requires-Dist: huggingface_hub; extra == "runtime-common"
  Requires-Dist: interegular; extra == "runtime-common"
+ Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
  Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
@@ -234,17 +235,20 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: torchao; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: modelscope; extra == "runtime-common"
+ Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
+ Requires-Dist: cuda-python; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
  Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+ Provides-Extra: srt-hpu
+ Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +278,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
  Requires-Dist: sglang[openai]; extra == "all-xpu"
  Requires-Dist: sglang[anthropic]; extra == "all-xpu"
  Requires-Dist: sglang[litellm]; extra == "all-xpu"
+ Provides-Extra: all-hpu
+ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+ Requires-Dist: sglang[openai]; extra == "all-hpu"
+ Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+ Requires-Dist: sglang[litellm]; extra == "all-hpu"
  Provides-Extra: dev
  Requires-Dist: sglang[all]; extra == "dev"
  Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +292,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
  Provides-Extra: dev-xpu
  Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
  Requires-Dist: sglang[test]; extra == "dev-xpu"
+ Provides-Extra: dev-hpu
+ Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+ Requires-Dist: sglang[test]; extra == "dev-hpu"
 
  <div align="center" id="sglangtop">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +333,16 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:
 
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
  ## Getting Started
- Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
- Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
- ## Backend: SGLang Runtime (SRT)
- See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
- ## Frontend: Structured Generation Language (SGLang)
- See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+ - [Install SGLang](https://sgl-project.github.io/start/install.html)
+ - [Send requests](https://sgl-project.github.io/start/send_request.html)
+ - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+ - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
  ## Benchmark And Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +350,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  ## Roadmap
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
- ## Citation And Acknowledgment
+ ## Adoption and Sponsorship
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+
+ ## Acknowledgment and Citation
+ We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
- We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

{sglang-0.3.6 → sglang-0.3.6.post2}/README.md
@@ -35,21 +35,16 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:
 
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
  ## Getting Started
- Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
- Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
- ## Backend: SGLang Runtime (SRT)
- See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
- ## Frontend: Structured Generation Language (SGLang)
- See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+ - [Install SGLang](https://sgl-project.github.io/start/install.html)
+ - [Send requests](https://sgl-project.github.io/start/send_request.html)
+ - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+ - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
  ## Benchmark And Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -57,6 +52,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  ## Roadmap
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
- ## Citation And Acknowledgment
+ ## Adoption and Sponsorship
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+
+ ## Acknowledgment and Citation
+ We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
- We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

{sglang-0.3.6 → sglang-0.3.6.post2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "sglang"
- version = "0.3.6"
+ version = "0.3.6.post2"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -17,13 +17,13 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
  [project.optional-dependencies]
  runtime_common = ["aiohttp", "decord", "fastapi",
- "hf_transfer", "huggingface_hub", "interegular",
+ "hf_transfer", "huggingface_hub", "interegular", "modelscope",
  "orjson", "outlines>=0.0.44,<0.1.0",
  "packaging", "pillow", "prometheus-client>=0.20.0",
  "psutil", "pydantic", "python-multipart",
  "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
- "modelscope"]
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
+ "xgrammar>=0.1.4"]
+ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python"]
 
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -31,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
  # xpu is not enabled in public vllm and torch whl,
  # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
  srt_xpu = ["sglang[runtime_common]"]
+ #For Intel Gaudi(device : hpu) follow the installation guide
+ #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+ srt_hpu = ["sglang[runtime_common]"]
 
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
@@ -46,9 +49,11 @@ test = [
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+ all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  dev = ["sglang[all]", "sglang[test]"]
  dev_hip = ["sglang[all_hip]", "sglang[test]"]
  dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+ dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
  [project.urls]
  "Homepage" = "https://github.com/sgl-project/sglang"

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/__init__.py
@@ -11,7 +11,7 @@ from sglang.api import (
  gen,
  gen_int,
  gen_string,
- get_server_args,
+ get_server_info,
  image,
  select,
  set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
  "gen",
  "gen_int",
  "gen_string",
- "get_server_args",
+ "get_server_info",
  "image",
  "select",
  "set_default_backend",

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/api.py
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
  return backend.flush_cache()
 
 
- def get_server_args(backend: Optional[BaseBackend] = None):
+ def get_server_info(backend: Optional[BaseBackend] = None):
  backend = backend or global_config.default_backend
  if backend is None:
  return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
  # If backend is Runtime
  if hasattr(backend, "endpoint"):
  backend = backend.endpoint
- return backend.get_server_args()
+ return backend.get_server_info()
 
 
  def gen(
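
Note: the get_server_args → get_server_info rename above is a public frontend API change; the same rename appears in sglang/__init__.py's exports (above) and in the RuntimeEndpoint backend further below, which now requests /get_server_info. A minimal usage sketch, assuming a local SGLang server on port 30000 (illustrative, not part of the diff):

    import sglang as sgl

    # Point the frontend at a running server; under the new name,
    # RuntimeEndpoint.get_server_info() hits <base_url>/get_server_info.
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

    # Replaces the old sgl.get_server_args() call.
    info = sgl.get_server_info()
    print(info)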

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch.py
@@ -212,6 +212,7 @@ def extend(reqs, model_runner):
  token_to_kv_pool=model_runner.token_to_kv_pool,
  tree_cache=None,
  model_config=model_runner.model_config,
+ enable_overlap=False,
  )
  batch.prepare_for_extend()
  model_worker_batch = batch.get_model_worker_batch()
@@ -278,10 +279,7 @@ def correctness_test(
 
 
  def synchronize(device):
- if device == "cuda":
- torch.cuda.synchronize()
- elif device == "xpu":
- torch.xpu.synchronize()
+ torch.get_device_module(device).synchronize()
 
 
  def latency_test_run_once(
@@ -468,7 +466,6 @@ if __name__ == "__main__":
 
  try:
  main(server_args, bench_args)
- except Exception as e:
- raise e
  finally:
- kill_child_process()
+ if server_args.tp_size != 1:
+ kill_child_process()
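
Note: the synchronize() change above relies on torch.get_device_module, which in recent PyTorch releases maps a device string to the matching backend module (torch.cuda, torch.xpu, ...), replacing the hand-written per-device branches. An illustrative equivalent (not part of the diff):

    import torch

    def synchronize(device: str) -> None:
        # torch.get_device_module("cuda") returns torch.cuda, "xpu" returns torch.xpu, etc.
        torch.get_device_module(device).synchronize()

    if torch.cuda.is_available():
        synchronize("cuda")  # same effect as torch.cuda.synchronize()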

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch_server.py
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
  It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
  Usage:
- python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
- python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
  """
 
  import argparse

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py
@@ -25,6 +25,7 @@ import warnings
  from argparse import ArgumentParser
  from dataclasses import dataclass, field
  from datetime import datetime
+ from pathlib import Path
  from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
  import aiohttp
@@ -407,7 +408,7 @@
 
 
  def get_model(pretrained_model_name_or_path: str) -> str:
- if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true":
+ if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
  import huggingface_hub.constants
  from modelscope import snapshot_download
 
@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
  return tokenizer.decode(selected_tokens)
 
 
+ def get_gen_prefix_cache_path(args, tokenizer):
+ """Create cache directory under ~/.cache/sglang/benchmark"""
+ cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+ # Create a unique cache filename based on the generation parameters
+ cache_key = (
+ f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
+ f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+ f"{tokenizer.__class__.__name__}.pkl"
+ )
+ return cache_dir / cache_key
+
+
  def sample_generated_shared_prefix_requests(
  num_groups: int,
  prompts_per_group: int,
@@ -701,12 +715,17 @@
  output_len: int,
  tokenizer: PreTrainedTokenizerBase,
  ) -> List[Tuple[str, int, int]]:
- if args.generated_input_path and os.path.exists(args.generated_input_path):
- print(f"\nloading generated input data from {args.generated_input_path}")
- with open(args.generated_input_path, "rb") as f:
+ """Generate benchmark requests with shared system prompts using random tokens and caching."""
+ cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+ # Try to load from cache first
+ if cache_path.exists():
+ print(f"\nLoading cached generated input data from {cache_path}")
+ with open(cache_path, "rb") as f:
  return pickle.load(f)
 
- """Generate benchmark requests with shared system prompts using random tokens."""
+ print("\nGenerating new input data...")
+
  # Generate system prompts for each group
  system_prompts = []
  for _ in range(num_groups):
@@ -719,17 +738,16 @@
  question = gen_prompt(tokenizer, question_len)
  questions.append(question)
 
- # Shuffle questions
- random.shuffle(questions)
-
  # Combine system prompts with questions
  input_requests = []
  total_input_tokens = 0
  total_output_tokens = 0
 
- for group_idx in range(num_groups):
+ for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
  system_prompt = system_prompts[group_idx]
- for prompt_idx in range(prompts_per_group):
+ for prompt_idx in tqdm(
+ range(prompts_per_group), desc="Generating questions", leave=False
+ ):
  question = questions[group_idx * prompts_per_group + prompt_idx]
  full_prompt = f"{system_prompt}\n\n{question}"
  prompt_len = len(tokenizer.encode(full_prompt))
@@ -738,6 +756,10 @@
  total_input_tokens += prompt_len
  total_output_tokens += output_len
 
+ # Shuffle questions
+ random.shuffle(input_requests)
+
+ # Print statistics
  print(f"\nGenerated shared prefix dataset statistics:")
  print(f"Number of groups: {num_groups}")
  print(f"Prompts per group: {prompts_per_group}")
@@ -750,11 +772,12 @@
  print(
  f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
  )
- if args.generated_input_save_path:
- print(f"Saving generated input data to {args.generated_input_save_path}")
- os.makedirs(os.path.dirname(args.generated_input_save_path), exist_ok=True)
- with open(args.generated_input_save_path, "wb") as f:
- pickle.dump(input_requests, f)
+
+ # Save to cache
+ cache_path.parent.mkdir(parents=True, exist_ok=True)
+ print(f"Caching generated input data to {cache_path}")
+ with open(cache_path, "wb") as f:
+ pickle.dump(input_requests, f)
 
  return input_requests
 
@@ -859,6 +882,7 @@ async def benchmark(
  tokenizer: PreTrainedTokenizerBase,
  input_requests: List[Tuple[str, int, int]],
  request_rate: float,
+ max_concurrency: Optional[int],
  disable_tqdm: bool,
  extra_request_body: Dict[str, Any],
  profile: bool,
@@ -868,6 +892,15 @@
  else:
  raise ValueError(f"Unknown backend: {backend}")
 
+ # From https://github.com/vllm-project/vllm/pull/9390
+ semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+ async def limited_request_func(request_func_input, pbar):
+ if semaphore is None:
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
+ async with semaphore:
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
+
  print("Starting initial single prompt test run...")
  test_prompt, test_prompt_len, test_output_len = input_requests[0]
  test_input = RequestFuncInput(
@@ -913,7 +946,7 @@
  )
  tasks.append(
  asyncio.create_task(
- request_func(request_func_input=request_func_input, pbar=pbar)
+ limited_request_func(request_func_input=request_func_input, pbar=pbar)
  )
  )
  outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +973,12 @@
  print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
  print("{:<40} {:<10}".format("Backend:", backend))
  print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+ print(
+ "{:<40} {:<10}".format(
+ "Max reqeuest concurrency:",
+ max_concurrency if max_concurrency else "not set",
+ )
+ )
  print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
  print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
  print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1042,7 @@
  "backend": args.backend,
  "dataset_name": args.dataset_name,
  "request_rate": request_rate,
+ "max_concurrency": max_concurrency,
  "total_input_tokens": metrics.total_input,
  "total_output_tokens": metrics.total_output,
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1130,10 @@ def run_benchmark(args_: argparse.Namespace):
  global args
  args = args_
 
+ # Set default value for max_concurrency if not present
+ if not hasattr(args, "max_concurrency"):
+ args.max_concurrency = None
+
  # Set global environments
  set_ulimit()
  random.seed(args.seed)
@@ -1201,6 +1245,7 @@
  tokenizer=tokenizer,
  input_requests=input_requests,
  request_rate=args.request_rate,
+ max_concurrency=args.max_concurrency,
  disable_tqdm=args.disable_tqdm,
  extra_request_body=extra_request_body,
  profile=args.profile,
@@ -1220,6 +1265,7 @@
  tokenizer=tokenizer,
  input_requests=input_requests,
  request_rate=rate,
+ max_concurrency=args.max_concurrency,
  disable_tqdm=args.disable_tqdm,
  extra_request_body=extra_request_body,
  profile=args.profile,
@@ -1319,6 +1365,19 @@ if __name__ == "__main__":
  help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
  "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
  )
+ parser.add_argument(
+ "--max-concurrency",
+ type=int,
+ default=None,
+ help="Maximum number of concurrent requests. This can be used "
+ "to help simulate an environment where a higher level component "
+ "is enforcing a maximum number of concurrent requests. While the "
+ "--request-rate argument controls the rate at which requests are "
+ "initiated, this argument will control how many are actually allowed "
+ "to execute at a time. This means that when used in combination, the "
+ "actual request rate may be lower than specified with --request-rate, "
+ "if the server is not processing requests fast enough to keep up.",
+ )
  parser.add_argument("--seed", type=int, default=1, help="The random seed.")
  parser.add_argument(
  "--multi",
@@ -1386,16 +1445,6 @@
  default=256,
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
  )
- parser.add_argument(
- "--generated-input-save-path",
- type=str,
- help="Path to save generated input data",
- )
- parser.add_argument(
- "--generated-input-path",
- type=str,
- help="Path to load previously generated input data",
- )
  parser.add_argument(
  "--profile",
  action="store_true",

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/check_env.py
@@ -22,18 +22,24 @@ PACKAGE_LIST = [
  "hf_transfer",
  "huggingface_hub",
  "interegular",
+ "modelscope",
+ "orjson",
+ "outlines",
+ "packaging",
  "psutil",
  "pydantic",
  "multipart",
  "zmq",
+ "torchao",
  "uvicorn",
  "uvloop",
  "vllm",
- "outlines",
+ "xgrammar",
  "openai",
  "tiktoken",
  "anthropic",
  "litellm",
+ "decord",
  ]
 
 

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/base_backend.py
@@ -78,5 +78,5 @@ class BaseBackend:
  def flush_cache(self):
  pass
 
- def get_server_args(self):
+ def get_server_info(self):
  pass

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/runtime_endpoint.py
@@ -58,9 +58,9 @@ class RuntimeEndpoint(BaseBackend):
  )
  self._assert_success(res)
 
- def get_server_args(self):
+ def get_server_info(self):
  res = http_request(
- self.base_url + "/get_server_args",
+ self.base_url + "/get_server_info",
  api_key=self.api_key,
  verify=self.verify,
  )

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/tracer.py
@@ -278,6 +278,6 @@ class TracingScope:
 
  def add_child_state(self, state: TracerProgramState):
  cur_scope = self
- while cur_scope != None:
+ while cur_scope is not None:
  cur_scope.tracer_state.child_states.append(state)
  cur_scope = cur_scope.last_scope

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server.py
@@ -1,6 +1,5 @@
  """Launch the inference server."""
 
- import os
  import sys
 
  from sglang.srt.server import launch_server
@@ -12,7 +11,5 @@ if __name__ == "__main__":
 
  try:
  launch_server(server_args)
- except Exception as e:
- raise e
  finally:
  kill_child_process()

{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py
@@ -1,27 +1,26 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
 
  import json
  import logging
- import os
  from enum import IntEnum, auto
  from typing import List, Optional
 
  from transformers import PretrainedConfig
 
  from sglang.srt.hf_transformers_utils import get_config, get_context_length
+ from sglang.srt.utils import get_bool_env_var
 
  logger = logging.getLogger(__name__)
 
@@ -60,13 +59,9 @@ class ModelConfig:
 
  # Derive context length
  derived_context_len = get_context_length(self.hf_text_config)
- allow_long_context = os.environ.get(
- "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
- )
-
  if context_length is not None:
  if context_length > derived_context_len:
- if allow_long_context:
+ if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
  logger.warning(
  f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
  f"This may lead to incorrect model outputs or CUDA errors."