sglang 0.3.5__tar.gz → 0.3.5.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. {sglang-0.3.5 → sglang-0.3.5.post1}/PKG-INFO +11 -7
  2. {sglang-0.3.5 → sglang-0.3.5.post1}/README.md +8 -5
  3. {sglang-0.3.5 → sglang-0.3.5.post1}/pyproject.toml +4 -3
  4. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_serving.py +113 -3
  5. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/model_config.py +5 -2
  6. sglang-0.3.5.post1/sglang/srt/constrained/__init__.py +17 -0
  7. sglang-0.3.5.post1/sglang/srt/constrained/base_grammar_backend.py +72 -0
  8. sglang-0.3.5.post1/sglang/srt/constrained/outlines_backend.py +165 -0
  9. sglang-0.3.5.post1/sglang/srt/constrained/outlines_jump_forward.py +182 -0
  10. sglang-0.3.5.post1/sglang/srt/constrained/xgrammar_backend.py +114 -0
  11. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
  12. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
  13. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/fused_moe.py +23 -7
  14. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/quantization/base_config.py +4 -6
  15. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/vocab_parallel_embedding.py +216 -150
  16. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/io_struct.py +5 -3
  17. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/schedule_batch.py +14 -20
  18. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/scheduler.py +153 -94
  19. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/tokenizer_manager.py +81 -17
  20. sglang-0.3.5.post1/sglang/srt/metrics/collector.py +211 -0
  21. sglang-0.3.5.post1/sglang/srt/metrics/func_timer.py +108 -0
  22. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mm_utils.py +1 -1
  23. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  24. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/model_executor/forward_batch_info.py +7 -3
  25. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/model_executor/model_runner.py +2 -1
  26. sglang-0.3.5.post1/sglang/srt/models/gemma2_reward.py +69 -0
  27. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gpt2.py +31 -37
  28. sglang-0.3.5.post1/sglang/srt/models/internlm2_reward.py +62 -0
  29. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama.py +11 -6
  30. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama_reward.py +5 -26
  31. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/olmo.py +0 -0
  32. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen2_vl.py +5 -7
  33. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/openai_api/adapter.py +6 -2
  34. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/sampling_batch_info.py +2 -3
  35. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/sampling_params.py +0 -14
  36. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/server.py +58 -16
  37. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/server_args.py +42 -22
  38. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/utils.py +87 -0
  39. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_common.py +1 -1
  40. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_humaneval.py +2 -2
  41. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_mgsm.py +2 -2
  42. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_utils.py +18 -4
  43. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/utils.py +1 -0
  44. sglang-0.3.5.post1/sglang/version.py +1 -0
  45. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/PKG-INFO +11 -7
  46. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/SOURCES.txt +8 -5
  47. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/requires.txt +2 -1
  48. sglang-0.3.5/sglang/srt/constrained/__init__.py +0 -81
  49. sglang-0.3.5/sglang/srt/constrained/base_tool_cache.py +0 -65
  50. sglang-0.3.5/sglang/srt/constrained/bnf_cache.py +0 -61
  51. sglang-0.3.5/sglang/srt/constrained/fsm_cache.py +0 -95
  52. sglang-0.3.5/sglang/srt/constrained/grammar.py +0 -190
  53. sglang-0.3.5/sglang/srt/constrained/jump_forward.py +0 -203
  54. sglang-0.3.5/sglang/version.py +0 -1
  55. {sglang-0.3.5 → sglang-0.3.5.post1}/LICENSE +0 -0
  56. {sglang-0.3.5 → sglang-0.3.5.post1}/setup.cfg +0 -0
  57. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/__init__.py +0 -0
  58. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/api.py +0 -0
  59. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_latency.py +0 -0
  60. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_server_latency.py +0 -0
  61. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/check_env.py +0 -0
  62. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/global_config.py +0 -0
  63. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/__init__.py +0 -0
  64. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/__init__.py +0 -0
  65. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/anthropic.py +0 -0
  66. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/base_backend.py +0 -0
  67. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/litellm.py +0 -0
  68. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/openai.py +0 -0
  69. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  70. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/vertexai.py +0 -0
  71. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/chat_template.py +0 -0
  72. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/choices.py +0 -0
  73. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/compiler.py +0 -0
  74. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/interpreter.py +0 -0
  75. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/ir.py +0 -0
  76. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/tracer.py +0 -0
  77. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/launch_server.py +0 -0
  78. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/launch_server_llavavid.py +0 -0
  79. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/__init__.py +0 -0
  80. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/exaone.py +0 -0
  81. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  82. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/conversation.py +0 -0
  83. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  84. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/activation.py +0 -0
  85. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  86. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  87. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  88. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  89. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  90. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  91. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  92. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
  93. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
  94. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/layernorm.py +0 -0
  95. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/linear.py +0 -0
  96. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/logits_processor.py +0 -0
  97. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/pooler.py +0 -0
  98. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  99. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/radix_attention.py +0 -0
  100. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
  101. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/sampler.py +0 -0
  102. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  103. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/lora/lora.py +0 -0
  104. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/lora/lora_config.py +0 -0
  105. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/lora/lora_manager.py +0 -0
  106. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  107. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  108. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/image_processor.py +0 -0
  109. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  110. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/tp_worker.py +0 -0
  111. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  112. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  113. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  114. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  115. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  116. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  117. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/baichuan.py +0 -0
  118. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/chatglm.py +0 -0
  119. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/commandr.py +0 -0
  120. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/dbrx.py +0 -0
  121. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/deepseek.py +0 -0
  122. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/deepseek_v2.py +0 -0
  123. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/exaone.py +0 -0
  124. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gemma.py +0 -0
  125. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gemma2.py +0 -0
  126. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  127. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/grok.py +0 -0
  128. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/internlm2.py +0 -0
  129. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama_classification.py +0 -0
  130. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama_embedding.py +0 -0
  131. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llava.py +0 -0
  132. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llavavid.py +0 -0
  133. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/minicpm.py +0 -0
  134. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/minicpm3.py +0 -0
  135. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mistral.py +0 -0
  136. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mixtral.py +0 -0
  137. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  138. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mllama.py +0 -0
  139. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/olmoe.py +0 -0
  140. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen.py +0 -0
  141. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen2.py +0 -0
  142. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  143. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/stablelm.py +0 -0
  144. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  145. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/xverse.py +0 -0
  146. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/xverse_moe.py +0 -0
  147. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/yivl.py +0 -0
  148. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/openai_api/protocol.py +0 -0
  149. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  150. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  151. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  152. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  153. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  154. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  155. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  156. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  157. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/run_eval.py +0 -0
  158. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/runners.py +0 -0
  159. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  160. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_math.py +0 -0
  161. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  162. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  163. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_activation.py +0 -0
  164. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_layernorm.py +0 -0
  165. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_programs.py +0 -0
  166. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/dependency_links.txt +0 -0
  167. {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.5
3
+ Version: 0.3.5.post1
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -225,13 +225,14 @@ Requires-Dist: interegular; extra == "runtime-common"
225
225
  Requires-Dist: orjson; extra == "runtime-common"
226
226
  Requires-Dist: packaging; extra == "runtime-common"
227
227
  Requires-Dist: pillow; extra == "runtime-common"
228
+ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
228
229
  Requires-Dist: psutil; extra == "runtime-common"
229
230
  Requires-Dist: pydantic; extra == "runtime-common"
230
231
  Requires-Dist: python-multipart; extra == "runtime-common"
231
232
  Requires-Dist: torchao; extra == "runtime-common"
232
233
  Requires-Dist: uvicorn; extra == "runtime-common"
233
234
  Requires-Dist: uvloop; extra == "runtime-common"
234
- Requires-Dist: zmq; extra == "runtime-common"
235
+ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
235
236
  Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
236
237
  Requires-Dist: modelscope; extra == "runtime-common"
237
238
  Provides-Extra: srt
@@ -291,13 +292,14 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
291
292
  [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
292
293
  [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
293
294
  [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
295
+ [![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)
294
296
 
295
297
  </div>
296
298
 
297
299
  --------------------------------------------------------------------------------
298
300
 
299
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
300
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
301
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
302
+ [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
301
303
 
302
304
  ## News
303
305
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -321,11 +323,13 @@ The core features include:
321
323
 
322
324
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
323
325
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
324
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
326
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
325
327
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
326
328
 
327
- ## Install
328
- See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
329
+ ## Getting Started
330
+ Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
331
+
332
+ Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
329
333
 
330
334
  ## Backend: SGLang Runtime (SRT)
331
335
  See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
@@ -6,13 +6,14 @@
6
6
  [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
7
7
  [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
8
8
  [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
9
+ [![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)
9
10
 
10
11
  </div>
11
12
 
12
13
  --------------------------------------------------------------------------------
13
14
 
14
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
15
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
15
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
16
+ [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
16
17
 
17
18
  ## News
18
19
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -36,11 +37,13 @@ The core features include:
36
37
 
37
38
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
38
39
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
39
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
40
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
40
41
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
41
42
 
42
- ## Install
43
- See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
43
+ ## Getting Started
44
+ Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
45
+
46
+ Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
44
47
 
45
48
  ## Backend: SGLang Runtime (SRT)
46
49
  See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.3.5"
7
+ version = "0.3.5.post1"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -17,10 +17,11 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
17
17
 
18
18
  [project.optional-dependencies]
19
19
  runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
20
- "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
21
- "torchao", "uvicorn", "uvloop", "zmq",
20
+ "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
21
+ "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
22
22
  "outlines>=0.0.44", "modelscope"]
23
23
  srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
24
+
24
25
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
25
26
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
26
27
  srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
@@ -596,12 +596,20 @@ def sample_random_requests(
596
596
 
597
597
  # Filter out sequences that are too long or too short
598
598
  input_requests: List[Tuple[str, int, int]] = []
599
- for i in range(num_prompts):
599
+ for data in dataset:
600
+ i = len(input_requests)
601
+ if i == num_prompts:
602
+ break
603
+
600
604
  # Tokenize the prompts and completions.
601
- prompt = dataset[i][0]
605
+ prompt = data[0]
602
606
  prompt_token_ids = tokenizer.encode(prompt)
603
607
  prompt_len = len(prompt_token_ids)
604
608
 
609
+ # Skip empty prompt
610
+ if prompt_len == 0:
611
+ continue
612
+
605
613
  if prompt_len > input_lens[i]:
606
614
  input_ids = prompt_token_ids[: input_lens[i]]
607
615
  else:
@@ -627,6 +635,66 @@ def sample_random_requests(
627
635
  return input_requests
628
636
 
629
637
 
638
+ def gen_prompt(tokenizer, token_num):
639
+ """Generate a random prompt of specified token length using tokenizer vocabulary."""
640
+ all_available_tokens = list(tokenizer.get_vocab().values())
641
+ selected_tokens = random.choices(all_available_tokens, k=token_num)
642
+ return tokenizer.decode(selected_tokens)
643
+
644
+
645
+ def sample_generated_shared_prefix_requests(
646
+ num_groups: int,
647
+ prompts_per_group: int,
648
+ system_prompt_len: int,
649
+ question_len: int,
650
+ output_len: int,
651
+ tokenizer: PreTrainedTokenizerBase,
652
+ ) -> List[Tuple[str, int, int]]:
653
+ """Generate benchmark requests with shared system prompts using random tokens."""
654
+ # Generate system prompts for each group
655
+ system_prompts = []
656
+ for _ in range(num_groups):
657
+ system_prompt = gen_prompt(tokenizer, system_prompt_len)
658
+ system_prompts.append(system_prompt)
659
+
660
+ # Generate questions
661
+ questions = []
662
+ for _ in range(num_groups * prompts_per_group):
663
+ question = gen_prompt(tokenizer, question_len)
664
+ questions.append(question)
665
+
666
+ # Combine system prompts with questions
667
+ input_requests = []
668
+ total_input_tokens = 0
669
+ total_output_tokens = 0
670
+
671
+ for group_idx in range(num_groups):
672
+ system_prompt = system_prompts[group_idx]
673
+ for prompt_idx in range(prompts_per_group):
674
+ question = questions[group_idx * prompts_per_group + prompt_idx]
675
+ full_prompt = f"{system_prompt}\n\n{question}"
676
+ prompt_len = len(tokenizer.encode(full_prompt))
677
+
678
+ input_requests.append((full_prompt, prompt_len, output_len))
679
+ total_input_tokens += prompt_len
680
+ total_output_tokens += output_len
681
+
682
+ print(f"\nGenerated shared prefix dataset statistics:")
683
+ print(f"Number of groups: {num_groups}")
684
+ print(f"Prompts per group: {prompts_per_group}")
685
+ print(f"Total prompts: {len(input_requests)}")
686
+ print(f"Total input tokens: {total_input_tokens}")
687
+ print(f"Total output tokens: {total_output_tokens}")
688
+ print(
689
+ f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
690
+ )
691
+ print(
692
+ f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
693
+ )
694
+
695
+ return input_requests
696
+
697
+
630
698
  async def get_request(
631
699
  input_requests: List[Tuple[str, int, int]],
632
700
  request_rate: float,
@@ -1048,6 +1116,15 @@ def run_benchmark(args_: argparse.Namespace):
1048
1116
  tokenizer=tokenizer,
1049
1117
  dataset_path=args.dataset_path,
1050
1118
  )
1119
+ elif args.dataset_name == "generated-shared-prefix":
1120
+ input_requests = sample_generated_shared_prefix_requests(
1121
+ num_groups=args.gen_num_groups,
1122
+ prompts_per_group=args.gen_prompts_per_group,
1123
+ system_prompt_len=args.gen_system_prompt_len,
1124
+ question_len=args.gen_question_len,
1125
+ output_len=args.gen_output_len,
1126
+ tokenizer=tokenizer,
1127
+ )
1051
1128
  else:
1052
1129
  raise ValueError(f"Unknown dataset: {args.dataset_name}")
1053
1130
 
@@ -1121,7 +1198,7 @@ if __name__ == "__main__":
1121
1198
  "--dataset-name",
1122
1199
  type=str,
1123
1200
  default="sharegpt",
1124
- choices=["sharegpt", "random"],
1201
+ choices=["sharegpt", "random", "generated-shared-prefix"],
1125
1202
  help="Name of the dataset to benchmark on.",
1126
1203
  )
1127
1204
  parser.add_argument(
@@ -1208,5 +1285,38 @@ if __name__ == "__main__":
1208
1285
  help="Append given JSON object to the request payload. You can use this to specify"
1209
1286
  "additional generate params like sampling params.",
1210
1287
  )
1288
+
1289
+ group = parser.add_argument_group("generated-shared-prefix dataset arguments")
1290
+ group.add_argument(
1291
+ "--gen-num-groups",
1292
+ type=int,
1293
+ default=64,
1294
+ help="Number of system prompt groups for generated-shared-prefix dataset",
1295
+ )
1296
+ group.add_argument(
1297
+ "--gen-prompts-per-group",
1298
+ type=int,
1299
+ default=16,
1300
+ help="Number of prompts per system prompt group for generated-shared-prefix dataset",
1301
+ )
1302
+ group.add_argument(
1303
+ "--gen-system-prompt-len",
1304
+ type=int,
1305
+ default=2048,
1306
+ help="Target length in tokens for system prompts in generated-shared-prefix dataset",
1307
+ )
1308
+ group.add_argument(
1309
+ "--gen-question-len",
1310
+ type=int,
1311
+ default=128,
1312
+ help="Target length in tokens for questions in generated-shared-prefix dataset",
1313
+ )
1314
+ group.add_argument(
1315
+ "--gen-output-len",
1316
+ type=int,
1317
+ default=256,
1318
+ help="Target length in tokens for outputs in generated-shared-prefix dataset",
1319
+ )
1320
+
1211
1321
  args = parser.parse_args()
1212
1322
  run_benchmark(args)
@@ -39,7 +39,7 @@ class ModelConfig:
39
39
  revision: Optional[str] = None,
40
40
  context_length: Optional[int] = None,
41
41
  model_override_args: Optional[dict] = None,
42
- is_embedding: Optional[bool] = None
42
+ is_embedding: Optional[bool] = None,
43
43
  ) -> None:
44
44
  # Parse args
45
45
  self.model_override_args = json.loads(model_override_args)
@@ -52,7 +52,9 @@ class ModelConfig:
52
52
  self.hf_text_config = get_hf_text_config(self.hf_config)
53
53
 
54
54
  # Check model type
55
- self.is_generation = is_generation_model(self.hf_config.architectures, is_embedding)
55
+ self.is_generation = is_generation_model(
56
+ self.hf_config.architectures, is_embedding
57
+ )
56
58
  self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
57
59
  self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
58
60
 
@@ -208,6 +210,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
208
210
  or "MistralModel" in model_architectures
209
211
  or "LlamaForSequenceClassification" in model_architectures
210
212
  or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
213
+ or "InternLM2ForRewardModel" in model_architectures
211
214
  ):
212
215
  return False
213
216
  else:
@@ -0,0 +1,17 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
16
+ # TODO(lmzheng): make this an optional dependency
17
+ from sglang.srt.constrained.outlines_backend import build_regex_from_object
@@ -0,0 +1,72 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
16
+ """The baseclass of backends for grammar-guided constrained decoding."""
17
+
18
+ from concurrent.futures import Future, ThreadPoolExecutor
19
+ from dataclasses import dataclass
20
+ from threading import Event, Lock
21
+ from typing import Any, Optional, Tuple
22
+
23
+
24
+ @dataclass
25
+ class CacheEntry:
26
+ value: Any
27
+ event: Event
28
+
29
+
30
+ class BaseGrammarObject:
31
+ pass
32
+
33
+
34
+ class BaseGrammarBackend:
35
+ def __init__(self):
36
+ self.executor = ThreadPoolExecutor()
37
+ self.cache = {}
38
+ self.cache_lock = Lock()
39
+
40
+ def init_value(self, key: Tuple[str, str]) -> BaseGrammarObject:
41
+ with self.cache_lock:
42
+ if key in self.cache:
43
+ cache_hit = True
44
+ entry = self.cache[key]
45
+ else:
46
+ cache_hit = False
47
+ entry = CacheEntry(None, Event())
48
+ self.cache[key] = entry
49
+
50
+ if cache_hit:
51
+ entry.event.wait()
52
+ else:
53
+ entry.value = self.init_value_impl(key)
54
+ entry.event.set()
55
+ return entry.value.copy()
56
+
57
+ def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
58
+ raise NotImplementedError()
59
+
60
+ def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
61
+ with self.cache_lock:
62
+ entry = self.cache.get(key)
63
+ if not entry or not entry.event.is_set():
64
+ return None
65
+ return self.cache[key].value.copy()
66
+
67
+ def get_future_value(self, key: Tuple[str, str]) -> Future:
68
+ return self.executor.submit(self.init_value, key)
69
+
70
+ def reset(self):
71
+ with self.cache_lock:
72
+ self.cache.clear()
@@ -0,0 +1,165 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
16
+ """Constrained decoding with outlines backend."""
17
+
18
+ import json
19
+ import logging
20
+ from typing import Dict, List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ from outlines.fsm.guide import RegexGuide
24
+ from outlines.models.transformers import TransformerTokenizer
25
+
26
+ from sglang.srt.constrained.base_grammar_backend import (
27
+ BaseGrammarBackend,
28
+ BaseGrammarObject,
29
+ )
30
+ from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
# Compatibility shim: older outlines releases export `build_regex_from_object`
# directly; since outlines 0.0.32 it was replaced by `build_regex_from_schema`,
# which only accepts a JSON-schema *string*. Recreate the old API on top of
# the new one so the rest of this module can use a single entry point.
try:
    from outlines.fsm.json_schema import build_regex_from_object
except ImportError:
    # Since outlines 0.0.32, build_regex_from_object is replaced by build_regex_from_schema,
    # which only accepts string schema as input.
    from outlines.fsm.json_schema import build_regex_from_schema
    from pydantic import BaseModel

    def build_regex_from_object(
        object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
    ):
        """Normalize `object` (pydantic model class, dict, or schema string)
        to a JSON-schema string and delegate to build_regex_from_schema.

        NOTE(review): the parameter shadows the `object` builtin; kept for
        API compatibility with the old outlines signature.
        """
        if isinstance(object, type(BaseModel)):
            # A pydantic model *class* (an instance of pydantic's metaclass),
            # not a model instance.
            schema = json.dumps(object.model_json_schema())
        elif isinstance(object, Dict):
            schema = json.dumps(object)
        else:
            # Assumed to already be a JSON-schema string.
            schema = object
        return build_regex_from_schema(schema, whitespace_pattern)
53
+
54
+
55
class OutlinesGrammar(BaseGrammarObject):
    """Per-request grammar state backed by an outlines RegexGuide.

    Tracks the current FSM state for one decoding request and optionally
    supports jump-forward decoding (emitting deterministic spans of the
    regex without running the model) via an OutlinesJumpForwardMap.
    """

    def __init__(
        self,
        guide: RegexGuide,
        jump_forward_map: Union[OutlinesJumpForwardMap, None],
    ) -> None:
        self.guide = guide
        self.jump_forward_map = jump_forward_map
        # Current FSM state; 0 is the RegexGuide start state.
        self.state = 0

    def accept_token(self, token: int):
        """Advance the FSM by one generated token id."""
        self.state = self.guide.get_next_state(self.state, token)

    def try_jump_forward(self, tokenizer) -> Optional[Tuple]:
        """Probe whether a deterministic jump is possible from the current state.

        Returns (suffix_ids, cur_state) — byte-fallback token ids that must be
        appended before the textual jump, and the FSM state after them — or
        None when no jump-forward map exists or the jump is trivial (<= 1 byte).
        """
        if not self.jump_forward_map:
            return None

        jump_forward_bytes = self.jump_forward_map.jump_forward_byte(self.state)
        if jump_forward_bytes is None or len(jump_forward_bytes) <= 1:
            return None

        # preprocess the jump forward string
        suffix_bytes = []
        # 0x80..0xBF are UTF-8 continuation bytes: if the deterministic path
        # starts mid-character, those bytes cannot be rendered as text and
        # must be emitted as individual byte-fallback tokens first.
        continuation_range = range(0x80, 0xC0)
        cur_state = self.state
        while (
            len(jump_forward_bytes) and jump_forward_bytes[0][0] in continuation_range
        ):
            # continuation bytes
            byte_edge = jump_forward_bytes.pop(0)
            suffix_bytes.append(byte_edge[0])
            cur_state = byte_edge[1]

        # "<0xAB>"-style byte tokens (SentencePiece byte-fallback vocabulary —
        # assumes the tokenizer defines them; TODO confirm for each model).
        suffix_tokens = [f"<0x{hex(b)[2:].upper()}>" for b in suffix_bytes]
        suffix_ids = tokenizer.convert_tokens_to_ids(suffix_tokens)
        return suffix_ids, cur_state

    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
        """Given the (suffix_ids, state) helper from try_jump_forward, return
        the textual jump-forward string and the FSM state after it."""
        _, cur_state = helper
        return self.jump_forward_map.jump_forward_symbol(cur_state)

    def jump_and_retokenize(
        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ):
        # Only the FSM state needs updating; the token id arguments are part
        # of the common grammar interface and unused by this backend.
        self.state = next_state

    def fill_vocab_mask(self, vocab_mask: torch.Tensor):
        """Write the token mask for the current state into `vocab_mask`:
        1 = forbidden, 0 = allowed (allowed ids come from the guide)."""
        vocab_mask.fill_(1)
        vocab_mask[self.guide.get_next_instruction(self.state).tokens] = 0

    def copy(self):
        # Fresh per-request instance: shares the (immutable) compiled guide
        # and jump-forward map, but resets `state` to the start state.
        return OutlinesGrammar(self.guide, self.jump_forward_map)
107
+
108
+
109
class OutlinesGrammarBackend(BaseGrammarBackend):
    """Grammar backend that compiles regex / JSON-schema constraints with outlines.

    Args:
        tokenizer: the HuggingFace tokenizer used by the model.
        whitespace_pattern: optional regex fragment controlling the whitespace
            allowed when converting a JSON schema to a regex (forwarded to
            outlines' build_regex_from_object). None uses outlines' default.
        allow_jump_forward: whether to build an OutlinesJumpForwardMap so
            deterministic spans of the regex can be emitted without decoding.
    """

    def __init__(
        self,
        tokenizer,
        # Was annotated `bool`, but this is a regex string (or None) that is
        # passed straight through as `whitespace_pattern=` below.
        whitespace_pattern: Optional[str],
        allow_jump_forward: bool,
    ):
        super().__init__()

        try:
            self.outlines_tokenizer = TransformerTokenizer(tokenizer)
        except AttributeError:
            # FIXME: tmp fix for chatglm2 & chatglm3 (pad_token_id=0)
            # Their tokenizer exposes pad_token_id as a read-only property;
            # temporarily graft a setter so TransformerTokenizer can assign it,
            # then restore the original id on the wrapped tokenizer.
            origin_pad_token_id = tokenizer.pad_token_id

            def fset(self, value):
                self._value = value

            type(tokenizer).pad_token_id = property(
                fget=type(tokenizer).pad_token_id.fget, fset=fset
            )
            self.outlines_tokenizer = TransformerTokenizer(tokenizer)
            self.outlines_tokenizer.tokenizer.pad_token_id = origin_pad_token_id
            self.outlines_tokenizer.pad_token_id = origin_pad_token_id
            self.outlines_tokenizer.pad_token = (
                self.outlines_tokenizer.tokenizer.pad_token
            )
            self.outlines_tokenizer.vocabulary = (
                self.outlines_tokenizer.tokenizer.get_vocab()
            )
        self.allow_jump_forward = allow_jump_forward
        self.whitespace_pattern = whitespace_pattern

    def init_value_impl(self, key: Tuple[str, str]) -> Optional[OutlinesGrammar]:
        """Compile a grammar for key=(key_type, key_string).

        key_type is "json" (key_string is a JSON schema) or "regex"
        (key_string is used verbatim).

        Returns:
            An OutlinesGrammar, or None when the JSON schema cannot be
            converted to a regex.

        Raises:
            ValueError: on an unknown key_type.
        """
        key_type, key_string = key
        if key_type == "json":
            try:
                regex = build_regex_from_object(
                    key_string,
                    whitespace_pattern=self.whitespace_pattern,
                )
            except NotImplementedError as e:
                logger.warning(
                    f"skip invalid json schema: json_schema={key_string}, {e=}"
                )
                # Return None, not a (None, key_string) tuple: callers
                # (BaseGrammarBackend.init_value) call .copy() on the result
                # and a tuple would crash with AttributeError.
                return None
        elif key_type == "regex":
            regex = key_string
        else:
            raise ValueError(f"Invalid key_type: {key_type}")

        guide = RegexGuide(regex, self.outlines_tokenizer)
        if self.allow_jump_forward:
            jump_forward_map = OutlinesJumpForwardMap(regex)
        else:
            jump_forward_map = None
        return OutlinesGrammar(guide, jump_forward_map)