sglang 0.3.6__tar.gz → 0.3.6.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. {sglang-0.3.6 → sglang-0.3.6.post1}/LICENSE +1 -1
  2. {sglang-0.3.6 → sglang-0.3.6.post1}/PKG-INFO +24 -15
  3. {sglang-0.3.6 → sglang-0.3.6.post1}/README.md +10 -12
  4. {sglang-0.3.6 → sglang-0.3.6.post1}/pyproject.toml +8 -3
  5. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/__init__.py +2 -2
  6. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/api.py +2 -2
  7. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/bench_one_batch.py +2 -4
  8. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/bench_serving.py +75 -26
  9. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/base_backend.py +1 -1
  10. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/runtime_endpoint.py +2 -2
  11. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/configs/model_config.py +13 -14
  12. sglang-0.3.6.post1/sglang/srt/constrained/__init__.py +16 -0
  13. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/constrained/base_grammar_backend.py +13 -15
  14. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_backend.py +13 -15
  15. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_jump_forward.py +13 -15
  16. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/constrained/xgrammar_backend.py +38 -57
  17. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/conversation.py +13 -15
  18. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/hf_transformers_utils.py +13 -15
  19. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/activation.py +13 -13
  20. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/flashinfer_backend.py +13 -6
  21. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  22. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  23. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  24. sglang-0.3.6.post1/sglang/srt/layers/custom_op_util.py +25 -0
  25. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  26. {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/layer.py +4 -9
  27. sglang-0.3.6/sglang/srt/layers/fused_moe/patch.py → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_patch.py +5 -0
  28. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  29. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  30. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  31. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/layernorm.py +13 -15
  32. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/logits_processor.py +13 -15
  33. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/__init__.py +77 -17
  34. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/radix_attention.py +13 -15
  35. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/rotary_embedding.py +13 -13
  36. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/lora/lora.py +13 -14
  37. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/lora/lora_config.py +13 -14
  38. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/lora/lora_manager.py +22 -24
  39. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/data_parallel_controller.py +25 -19
  40. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/detokenizer_manager.py +13 -16
  41. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/io_struct.py +43 -28
  42. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_batch.py +55 -26
  43. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_policy.py +13 -15
  44. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/scheduler.py +89 -70
  45. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/session_controller.py +14 -15
  46. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/tokenizer_manager.py +29 -22
  47. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker.py +13 -15
  48. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
  49. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/metrics/collector.py +13 -15
  50. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/metrics/func_timer.py +13 -15
  51. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/mm_utils.py +13 -14
  52. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/model_executor/cuda_graph_runner.py +20 -19
  53. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/model_executor/forward_batch_info.py +19 -17
  54. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/model_executor/model_runner.py +42 -30
  55. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/chatglm.py +15 -16
  56. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/commandr.py +15 -16
  57. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/dbrx.py +15 -16
  58. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/deepseek.py +15 -15
  59. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/deepseek_v2.py +15 -15
  60. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/exaone.py +14 -15
  61. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/gemma.py +14 -14
  62. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/gemma2.py +24 -19
  63. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/gemma2_reward.py +13 -14
  64. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/gpt_bigcode.py +14 -14
  65. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/grok.py +15 -15
  66. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/internlm2.py +13 -15
  67. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/internlm2_reward.py +13 -14
  68. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/llama.py +21 -21
  69. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/llama_classification.py +13 -14
  70. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/llama_reward.py +13 -14
  71. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/llava.py +13 -15
  72. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/llavavid.py +13 -15
  73. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/minicpm.py +13 -15
  74. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/minicpm3.py +13 -15
  75. sglang-0.3.6.post1/sglang/srt/models/mistral.py +23 -0
  76. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/mixtral.py +15 -15
  77. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/mixtral_quant.py +14 -14
  78. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/olmo.py +21 -19
  79. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/olmoe.py +23 -20
  80. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/qwen.py +14 -14
  81. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/qwen2.py +22 -19
  82. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_moe.py +17 -18
  83. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/stablelm.py +18 -16
  84. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/torch_native_llama.py +15 -17
  85. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/xverse.py +13 -14
  86. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/xverse_moe.py +15 -16
  87. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/yivl.py +13 -15
  88. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/openai_api/adapter.py +13 -15
  89. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/openai_api/protocol.py +13 -15
  90. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_batch_info.py +4 -1
  91. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_params.py +13 -15
  92. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/server.py +59 -34
  93. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/server_args.py +22 -22
  94. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/utils.py +196 -17
  95. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k.py +8 -4
  96. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/runners.py +13 -14
  97. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/test_utils.py +1 -1
  98. sglang-0.3.6.post1/sglang/version.py +1 -0
  99. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang.egg-info/PKG-INFO +24 -15
  100. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang.egg-info/SOURCES.txt +7 -4
  101. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang.egg-info/requires.txt +15 -1
  102. sglang-0.3.6/sglang/srt/constrained/__init__.py +0 -17
  103. sglang-0.3.6/sglang/srt/layers/custom_op_util.py +0 -26
  104. sglang-0.3.6/sglang/srt/layers/fused_moe/__init__.py +0 -1
  105. sglang-0.3.6/sglang/srt/models/mistral.py +0 -25
  106. sglang-0.3.6/sglang/version.py +0 -1
  107. {sglang-0.3.6 → sglang-0.3.6.post1}/setup.cfg +0 -0
  108. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/bench_latency.py +0 -0
  109. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/bench_offline_throughput.py +0 -0
  110. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/bench_one_batch_server.py +0 -0
  111. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/check_env.py +0 -0
  112. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/global_config.py +0 -0
  113. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/__init__.py +0 -0
  114. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/__init__.py +0 -0
  115. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/anthropic.py +0 -0
  116. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/litellm.py +0 -0
  117. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/openai.py +0 -0
  118. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/vertexai.py +0 -0
  119. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/chat_template.py +0 -0
  120. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/choices.py +0 -0
  121. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/compiler.py +0 -0
  122. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/interpreter.py +0 -0
  123. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/ir.py +0 -0
  124. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/tracer.py +0 -0
  125. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/launch_server.py +0 -0
  126. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/launch_server_llavavid.py +0 -0
  127. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/configs/__init__.py +0 -0
  128. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/configs/exaone.py +0 -0
  129. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  130. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  131. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  132. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  133. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  134. {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/fused_moe.py +0 -0
  135. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/linear.py +0 -0
  136. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/pooler.py +0 -0
  137. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  138. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/sampler.py +0 -0
  139. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  140. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  141. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/managers/image_processor.py +0 -0
  142. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  143. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  144. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  145. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  146. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  147. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/model_parallel.py +0 -0
  148. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/baichuan.py +0 -0
  149. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/gpt2.py +0 -0
  150. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/llama_embedding.py +0 -0
  151. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/mllama.py +0 -0
  152. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/phi3_small.py +0 -0
  153. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_vl.py +0 -0
  154. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  155. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  156. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  157. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  158. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  159. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  160. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  161. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/run_eval.py +0 -0
  162. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/simple_eval_common.py +0 -0
  163. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  164. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  165. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/simple_eval_math.py +0 -0
  166. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  167. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  168. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  169. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/test_activation.py +0 -0
  170. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/test_layernorm.py +0 -0
  171. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/test/test_programs.py +0 -0
  172. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang/utils.py +0 -0
  173. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang.egg-info/dependency_links.txt +0 -0
  174. {sglang-0.3.6 → sglang-0.3.6.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.6 → sglang-0.3.6.post1}/LICENSE
@@ -186,7 +186,7 @@
  same "printed page" as the copyright notice for easier
  identification within third-party archives.

- Copyright [yyyy] [name of copyright owner]
+ Copyright 2023-2024 SGLang Team

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
{sglang-0.3.6 → sglang-0.3.6.post1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.6
+ Version: 0.3.6.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
  same "printed page" as the copyright notice for easier
  identification within third-party archives.

- Copyright [yyyy] [name of copyright owner]
+ Copyright 2023-2024 SGLang Team

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -222,6 +222,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
  Requires-Dist: hf_transfer; extra == "runtime-common"
  Requires-Dist: huggingface_hub; extra == "runtime-common"
  Requires-Dist: interegular; extra == "runtime-common"
+ Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
  Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
@@ -234,7 +235,7 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: torchao; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: modelscope; extra == "runtime-common"
+ Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: torch; extra == "srt"
@@ -245,6 +246,8 @@ Requires-Dist: torch; extra == "srt-hip"
  Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+ Provides-Extra: srt-hpu
+ Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +277,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
  Requires-Dist: sglang[openai]; extra == "all-xpu"
  Requires-Dist: sglang[anthropic]; extra == "all-xpu"
  Requires-Dist: sglang[litellm]; extra == "all-xpu"
+ Provides-Extra: all-hpu
+ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+ Requires-Dist: sglang[openai]; extra == "all-hpu"
+ Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+ Requires-Dist: sglang[litellm]; extra == "all-hpu"
  Provides-Extra: dev
  Requires-Dist: sglang[all]; extra == "dev"
  Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +291,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
  Provides-Extra: dev-xpu
  Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
  Requires-Dist: sglang[test]; extra == "dev-xpu"
+ Provides-Extra: dev-hpu
+ Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+ Requires-Dist: sglang[test]; extra == "dev-hpu"

  <div align="center" id="sglangtop">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +332,16 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:

- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

  ## Getting Started
- Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
- Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
- ## Backend: SGLang Runtime (SRT)
- See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
- ## Frontend: Structured Generation Language (SGLang)
- See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+ - [Install SGLang](https://sgl-project.github.io/start/install.html)
+ - [Send requests](https://sgl-project.github.io/start/send_request.html)
+ - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+ - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)

  ## Benchmark And Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +349,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  ## Roadmap
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

- ## Citation And Acknowledgment
+ ## Adoption and Sponsorship
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+ ## Acknowledgment and Citation
+ We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
- We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.3.6 → sglang-0.3.6.post1}/README.md
@@ -35,21 +35,16 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:

- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

  ## Getting Started
- Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
- Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
- ## Backend: SGLang Runtime (SRT)
- See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
- ## Frontend: Structured Generation Language (SGLang)
- See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+ - [Install SGLang](https://sgl-project.github.io/start/install.html)
+ - [Send requests](https://sgl-project.github.io/start/send_request.html)
+ - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+ - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)

  ## Benchmark And Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -57,6 +52,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  ## Roadmap
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

- ## Citation And Acknowledgment
+ ## Adoption and Sponsorship
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+ ## Acknowledgment and Citation
+ We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
- We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.3.6 → sglang-0.3.6.post1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.3.6"
+ version = "0.3.6.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -17,12 +17,12 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]

  [project.optional-dependencies]
  runtime_common = ["aiohttp", "decord", "fastapi",
-     "hf_transfer", "huggingface_hub", "interegular",
+     "hf_transfer", "huggingface_hub", "interegular", "modelscope",
      "orjson", "outlines>=0.0.44,<0.1.0",
      "packaging", "pillow", "prometheus-client>=0.20.0",
      "psutil", "pydantic", "python-multipart",
      "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
-     "modelscope"]
+     "xgrammar>=0.1.4"]
  srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]

  # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -31,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
  # xpu is not enabled in public vllm and torch whl,
  # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
  srt_xpu = ["sglang[runtime_common]"]
+ #For Intel Gaudi(device : hpu) follow the installation guide
+ #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+ srt_hpu = ["sglang[runtime_common]"]

  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
@@ -46,9 +49,11 @@ test = [
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+ all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  dev = ["sglang[all]", "sglang[test]"]
  dev_hip = ["sglang[all_hip]", "sglang[test]"]
  dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+ dev_hpu = ["sglang[all_hpu]", "sglang[test]"]

  [project.urls]
  "Homepage" = "https://github.com/sgl-project/sglang"
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/__init__.py
@@ -11,7 +11,7 @@ from sglang.api import (
      gen,
      gen_int,
      gen_string,
-     get_server_args,
+     get_server_info,
      image,
      select,
      set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
      "gen",
      "gen_int",
      "gen_string",
-     "get_server_args",
+     "get_server_info",
      "image",
      "select",
      "set_default_backend",
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/api.py
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
      return backend.flush_cache()


- def get_server_args(backend: Optional[BaseBackend] = None):
+ def get_server_info(backend: Optional[BaseBackend] = None):
      backend = backend or global_config.default_backend
      if backend is None:
          return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
      # If backend is Runtime
      if hasattr(backend, "endpoint"):
          backend = backend.endpoint
-     return backend.get_server_args()
+     return backend.get_server_info()


  def gen(
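The rename is mechanical but spans three layers: the frontend helper above, the BaseBackend/RuntimeEndpoint methods, and the server's HTTP route (see the base_backend.py and runtime_endpoint.py hunks further down). A minimal usage sketch against a locally running server; the URL and default port are assumptions, not part of this diff:

    # Sketch: querying runtime settings through the renamed API.
    import sglang as sgl

    backend = sgl.RuntimeEndpoint("http://localhost:30000")  # assumed local server
    sgl.set_default_backend(backend)
    print(sgl.get_server_info())  # hits /get_server_info (formerly /get_server_args)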
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/bench_one_batch.py
@@ -212,6 +212,7 @@ def extend(reqs, model_runner):
          token_to_kv_pool=model_runner.token_to_kv_pool,
          tree_cache=None,
          model_config=model_runner.model_config,
+         enable_overlap=False,
      )
      batch.prepare_for_extend()
      model_worker_batch = batch.get_model_worker_batch()
@@ -278,10 +279,7 @@


  def synchronize(device):
-     if device == "cuda":
-         torch.cuda.synchronize()
-     elif device == "xpu":
-         torch.xpu.synchronize()
+     torch.get_device_module(device).synchronize()


  def latency_test_run_once(
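torch.get_device_module resolves a device string to its accelerator module (e.g. "cuda" to torch.cuda), which is what lets one line replace the per-device branches. A standalone sketch, assuming a PyTorch version recent enough to ship torch.get_device_module:

    import torch

    def synchronize(device: str) -> None:
        # "cuda" -> torch.cuda, "xpu" -> torch.xpu, and so on.
        torch.get_device_module(device).synchronize()

    if torch.cuda.is_available():
        synchronize("cuda")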
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/bench_serving.py
@@ -25,6 +25,7 @@ import warnings
  from argparse import ArgumentParser
  from dataclasses import dataclass, field
  from datetime import datetime
+ from pathlib import Path
  from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

  import aiohttp
@@ -407,7 +408,7 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:


  def get_model(pretrained_model_name_or_path: str) -> str:
-     if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true":
+     if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
          import huggingface_hub.constants
          from modelscope import snapshot_download

@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
      return tokenizer.decode(selected_tokens)


+ def get_gen_prefix_cache_path(args, tokenizer):
+     """Create cache directory under ~/.cache/sglang/benchmark"""
+     cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+     # Create a unique cache filename based on the generation parameters
+     cache_key = (
+         f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
+         f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+         f"{tokenizer.__class__.__name__}.pkl"
+     )
+     return cache_dir / cache_key
+
+
  def sample_generated_shared_prefix_requests(
      num_groups: int,
      prompts_per_group: int,
@@ -701,12 +715,17 @@
      output_len: int,
      tokenizer: PreTrainedTokenizerBase,
  ) -> List[Tuple[str, int, int]]:
-     if args.generated_input_path and os.path.exists(args.generated_input_path):
-         print(f"\nloading generated input data from {args.generated_input_path}")
-         with open(args.generated_input_path, "rb") as f:
+     """Generate benchmark requests with shared system prompts using random tokens and caching."""
+     cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+     # Try to load from cache first
+     if cache_path.exists():
+         print(f"\nLoading cached generated input data from {cache_path}")
+         with open(cache_path, "rb") as f:
              return pickle.load(f)

-     """Generate benchmark requests with shared system prompts using random tokens."""
+     print("\nGenerating new input data...")
+
      # Generate system prompts for each group
      system_prompts = []
      for _ in range(num_groups):
@@ -719,17 +738,16 @@
          question = gen_prompt(tokenizer, question_len)
          questions.append(question)

-     # Shuffle questions
-     random.shuffle(questions)
-
      # Combine system prompts with questions
      input_requests = []
      total_input_tokens = 0
      total_output_tokens = 0

-     for group_idx in range(num_groups):
+     for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
          system_prompt = system_prompts[group_idx]
-         for prompt_idx in range(prompts_per_group):
+         for prompt_idx in tqdm(
+             range(prompts_per_group), desc="Generating questions", leave=False
+         ):
              question = questions[group_idx * prompts_per_group + prompt_idx]
              full_prompt = f"{system_prompt}\n\n{question}"
              prompt_len = len(tokenizer.encode(full_prompt))
@@ -738,6 +756,10 @@
              total_input_tokens += prompt_len
              total_output_tokens += output_len

+     # Shuffle questions
+     random.shuffle(input_requests)
+
+     # Print statistics
      print(f"\nGenerated shared prefix dataset statistics:")
      print(f"Number of groups: {num_groups}")
      print(f"Prompts per group: {prompts_per_group}")
@@ -750,11 +772,12 @@
      print(
          f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
      )
-     if args.generated_input_save_path:
-         print(f"Saving generated input data to {args.generated_input_save_path}")
-         os.makedirs(os.path.dirname(args.generated_input_save_path), exist_ok=True)
-         with open(args.generated_input_save_path, "wb") as f:
-             pickle.dump(input_requests, f)
+
+     # Save to cache
+     cache_path.parent.mkdir(parents=True, exist_ok=True)
+     print(f"Caching generated input data to {cache_path}")
+     with open(cache_path, "wb") as f:
+         pickle.dump(input_requests, f)

      return input_requests

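Together with the get_gen_prefix_cache_path hunk above, this replaces the explicit save/load flags with a transparent pickle cache keyed on the generation parameters. A hedged illustration of where one run's data would land (parameter values invented for the example):

    from pathlib import Path

    # Invented example values for the five generation parameters.
    num_groups, prompts_per_group = 64, 16
    system_prompt_len, question_len, output_len = 2048, 128, 256

    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
    cache_key = (
        f"gen_prefix_{num_groups}_{prompts_per_group}_"
        f"{system_prompt_len}_{question_len}_{output_len}_LlamaTokenizerFast.pkl"
    )
    print(cache_dir / cache_key)
    # ~/.cache/sglang/benchmark/gen_prefix_64_16_2048_128_256_LlamaTokenizerFast.pkl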
@@ -859,6 +882,7 @@ async def benchmark(
      tokenizer: PreTrainedTokenizerBase,
      input_requests: List[Tuple[str, int, int]],
      request_rate: float,
+     max_concurrency: Optional[int],
      disable_tqdm: bool,
      extra_request_body: Dict[str, Any],
      profile: bool,
@@ -868,6 +892,15 @@ async def benchmark(
      else:
          raise ValueError(f"Unknown backend: {backend}")

+     # From https://github.com/vllm-project/vllm/pull/9390
+     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+     async def limited_request_func(request_func_input, pbar):
+         if semaphore is None:
+             return await request_func(request_func_input=request_func_input, pbar=pbar)
+         async with semaphore:
+             return await request_func(request_func_input=request_func_input, pbar=pbar)
+
      print("Starting initial single prompt test run...")
      test_prompt, test_prompt_len, test_output_len = input_requests[0]
      test_input = RequestFuncInput(
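The wrapper above (adapted from vllm PR #9390) caps in-flight requests without altering arrival times: tasks are still created on the request-rate schedule, but at most max_concurrency of them execute at once. A self-contained sketch of the same pattern, with a sleep standing in for the HTTP call:

    import asyncio
    from typing import Optional

    async def fake_request(i: int) -> int:
        await asyncio.sleep(0.05)  # stand-in for one HTTP request
        return i

    async def main(max_concurrency: Optional[int]) -> None:
        semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

        async def limited(i: int) -> int:
            if semaphore is None:
                return await fake_request(i)
            async with semaphore:  # at most max_concurrency awaits run inside
                return await fake_request(i)

        results = await asyncio.gather(*(limited(i) for i in range(100)))
        print(f"completed {len(results)} requests")

    asyncio.run(main(max_concurrency=8))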
@@ -913,7 +946,7 @@
          )
          tasks.append(
              asyncio.create_task(
-                 request_func(request_func_input=request_func_input, pbar=pbar)
+                 limited_request_func(request_func_input=request_func_input, pbar=pbar)
              )
          )
      outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +973,12 @@
      print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
      print("{:<40} {:<10}".format("Backend:", backend))
      print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+     print(
+         "{:<40} {:<10}".format(
+             "Max reqeuest concurrency:",
+             max_concurrency if max_concurrency else "not set",
+         )
+     )
      print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
      print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
      print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1042,7 @@
          "backend": args.backend,
          "dataset_name": args.dataset_name,
          "request_rate": request_rate,
+         "max_concurrency": max_concurrency,
          "total_input_tokens": metrics.total_input,
          "total_output_tokens": metrics.total_output,
          "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1130,10 @@ def run_benchmark(args_: argparse.Namespace):
      global args
      args = args_

+     # Set default value for max_concurrency if not present
+     if not hasattr(args, "max_concurrency"):
+         args.max_concurrency = None
+
      # Set global environments
      set_ulimit()
      random.seed(args.seed)
@@ -1201,6 +1245,7 @@
              tokenizer=tokenizer,
              input_requests=input_requests,
              request_rate=args.request_rate,
+             max_concurrency=args.max_concurrency,
              disable_tqdm=args.disable_tqdm,
              extra_request_body=extra_request_body,
              profile=args.profile,
@@ -1220,6 +1265,7 @@
                  tokenizer=tokenizer,
                  input_requests=input_requests,
                  request_rate=rate,
+                 max_concurrency=args.max_concurrency,
                  disable_tqdm=args.disable_tqdm,
                  extra_request_body=extra_request_body,
                  profile=args.profile,
@@ -1319,6 +1365,19 @@ if __name__ == "__main__":
          help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
          "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
      )
+     parser.add_argument(
+         "--max-concurrency",
+         type=int,
+         default=None,
+         help="Maximum number of concurrent requests. This can be used "
+         "to help simulate an environment where a higher level component "
+         "is enforcing a maximum number of concurrent requests. While the "
+         "--request-rate argument controls the rate at which requests are "
+         "initiated, this argument will control how many are actually allowed "
+         "to execute at a time. This means that when used in combination, the "
+         "actual request rate may be lower than specified with --request-rate, "
+         "if the server is not processing requests fast enough to keep up.",
+     )
      parser.add_argument("--seed", type=int, default=1, help="The random seed.")
      parser.add_argument(
          "--multi",
@@ -1386,16 +1445,6 @@
          default=256,
          help="Target length in tokens for outputs in generated-shared-prefix dataset",
      )
-     parser.add_argument(
-         "--generated-input-save-path",
-         type=str,
-         help="Path to save generated input data",
-     )
-     parser.add_argument(
-         "--generated-input-path",
-         type=str,
-         help="Path to load previously generated input data",
-     )
      parser.add_argument(
          "--profile",
          action="store_true",
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/base_backend.py
@@ -78,5 +78,5 @@ class BaseBackend:
      def flush_cache(self):
          pass

-     def get_server_args(self):
+     def get_server_info(self):
          pass
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/lang/backend/runtime_endpoint.py
@@ -58,9 +58,9 @@ class RuntimeEndpoint(BaseBackend):
          )
          self._assert_success(res)

-     def get_server_args(self):
+     def get_server_info(self):
          res = http_request(
-             self.base_url + "/get_server_args",
+             self.base_url + "/get_server_info",
              api_key=self.api_key,
              verify=self.verify,
          )
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/configs/model_config.py
@@ -1,17 +1,16 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================

  import json
  import logging
sglang-0.3.6.post1/sglang/srt/constrained/__init__.py (new file)
@@ -0,0 +1,16 @@
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ # TODO(lmzheng): make this an optional dependency
+ from sglang.srt.constrained.outlines_backend import build_regex_from_object
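The recreated __init__.py simply re-exports build_regex_from_object from the outlines backend (the old 17-line module it replaces is deleted further down, entry 102 in the file list). A hedged usage sketch; the pydantic model is made up, and the regex semantics follow outlines' JSON-schema conversion:

    from pydantic import BaseModel
    from sglang.srt.constrained import build_regex_from_object

    class City(BaseModel):  # invented example schema
        name: str
        population: int

    # Regex matching exactly the JSON serializations of City.
    regex = build_regex_from_object(City)
    print(regex[:60], "...")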
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/constrained/base_grammar_backend.py
@@ -1,18 +1,16 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
  """The baseclass of a backend for grammar-guided constrained decoding."""

  from concurrent.futures import Future, ThreadPoolExecutor
{sglang-0.3.6 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_backend.py
@@ -1,18 +1,16 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
  """Constrained decoding with outlines backend."""

  import json