sglang 0.3.3__tar.gz → 0.3.3.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. {sglang-0.3.3/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO +13 -6
  2. {sglang-0.3.3 → sglang-0.3.3.post1}/README.md +12 -5
  3. {sglang-0.3.3 → sglang-0.3.3.post1}/pyproject.toml +1 -1
  4. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_latency.py +3 -3
  5. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/conversation.py +11 -2
  6. sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py +177 -0
  7. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py +7 -2
  8. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py +6 -0
  9. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/scheduler.py +46 -5
  10. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py +9 -0
  11. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/model_runner.py +40 -35
  12. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/openai_api/adapter.py +5 -3
  13. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_batch_info.py +19 -7
  14. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/server.py +55 -20
  15. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/server_args.py +14 -11
  16. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/utils.py +26 -11
  17. sglang-0.3.3.post1/sglang/version.py +1 -0
  18. {sglang-0.3.3 → sglang-0.3.3.post1/sglang.egg-info}/PKG-INFO +13 -6
  19. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/SOURCES.txt +1 -0
  20. sglang-0.3.3/sglang/version.py +0 -1
  21. {sglang-0.3.3 → sglang-0.3.3.post1}/LICENSE +0 -0
  22. {sglang-0.3.3 → sglang-0.3.3.post1}/setup.cfg +0 -0
  23. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/__init__.py +0 -0
  24. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/api.py +0 -0
  25. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_server_latency.py +0 -0
  26. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_serving.py +0 -0
  27. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/check_env.py +0 -0
  28. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/global_config.py +0 -0
  29. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/__init__.py +0 -0
  30. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/__init__.py +0 -0
  31. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/anthropic.py +0 -0
  32. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/base_backend.py +0 -0
  33. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/litellm.py +0 -0
  34. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/openai.py +0 -0
  35. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  36. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/vertexai.py +0 -0
  37. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/chat_template.py +0 -0
  38. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/choices.py +0 -0
  39. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/compiler.py +0 -0
  40. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/interpreter.py +0 -0
  41. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/ir.py +0 -0
  42. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/tracer.py +0 -0
  43. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/launch_server.py +0 -0
  44. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/launch_server_llavavid.py +0 -0
  45. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/__init__.py +0 -0
  46. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/exaone.py +0 -0
  47. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/model_config.py +0 -0
  48. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/__init__.py +0 -0
  49. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
  50. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/fsm_cache.py +0 -0
  51. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/jump_forward.py +0 -0
  52. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  53. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/activation.py +0 -0
  54. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  55. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  56. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/flashinfer_utils.py +0 -0
  57. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  58. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  59. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  60. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  61. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  62. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  63. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
  64. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
  65. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/layernorm.py +0 -0
  66. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/linear.py +0 -0
  67. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/logits_processor.py +0 -0
  68. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/pooler.py +0 -0
  69. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  70. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  71. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/radix_attention.py +0 -0
  72. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/sampler.py +0 -0
  73. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  74. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora.py +0 -0
  75. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora_config.py +0 -0
  76. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora_manager.py +0 -0
  77. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  78. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/image_processor.py +0 -0
  79. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  80. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tp_worker.py +0 -0
  81. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  82. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  83. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  84. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  85. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  86. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mm_utils.py +0 -0
  87. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  88. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  89. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/baichuan.py +0 -0
  90. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/chatglm.py +0 -0
  91. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/commandr.py +0 -0
  92. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/dbrx.py +0 -0
  93. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/deepseek.py +0 -0
  94. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/deepseek_v2.py +0 -0
  95. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/exaone.py +0 -0
  96. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gemma.py +0 -0
  97. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gemma2.py +0 -0
  98. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  99. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/grok.py +0 -0
  100. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/internlm2.py +0 -0
  101. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama.py +0 -0
  102. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_classification.py +0 -0
  103. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_embedding.py +0 -0
  104. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_reward.py +0 -0
  105. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llava.py +0 -0
  106. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llavavid.py +0 -0
  107. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/minicpm.py +0 -0
  108. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/minicpm3.py +0 -0
  109. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mistral.py +0 -0
  110. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mixtral.py +0 -0
  111. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  112. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/olmoe.py +0 -0
  113. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen.py +0 -0
  114. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen2.py +0 -0
  115. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  116. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/stablelm.py +0 -0
  117. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  118. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/xverse.py +0 -0
  119. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/xverse_moe.py +0 -0
  120. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/yivl.py +0 -0
  121. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/openai_api/protocol.py +0 -0
  122. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  123. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  124. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  125. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  126. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  127. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  128. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  129. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  130. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/run_eval.py +0 -0
  131. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/runners.py +0 -0
  132. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_common.py +0 -0
  133. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  134. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  135. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_math.py +0 -0
  136. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  137. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  138. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  139. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_activation.py +0 -0
  140. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_layernorm.py +0 -0
  141. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_programs.py +0 -0
  142. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_utils.py +0 -0
  143. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/utils.py +0 -0
  144. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/dependency_links.txt +0 -0
  145. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/requires.txt +0 -0
  146. {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.3.3/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.3
+ Version: 0.3.3.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -257,8 +257,8 @@ Provides-Extra: dev
  Requires-Dist: sglang[all]; extra == "dev"
  Requires-Dist: sglang[test]; extra == "dev"

- <div align="center">
- <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+ <div align="center" id="sglangtop">
+ <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

  [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -270,10 +270,9 @@ Requires-Dist: sglang[test]; extra == "dev"

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

  ## Upcoming Events
- - [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
  - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

  ## News
@@ -324,7 +323,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -848,3 +847,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
  ## Citation And Acknowledgment
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+ <p align="center">
+ <a href="#sglangtop" target="_blank">
+ <bold>Back To Top </bold>
+ </a>
+ </p>

{sglang-0.3.3 → sglang-0.3.3.post1}/README.md
@@ -1,5 +1,5 @@
- <div align="center">
- <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+ <div align="center" id="sglangtop">
+ <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

  [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -11,10 +11,9 @@

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

  ## Upcoming Events
- - [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
  - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

  ## News
@@ -65,7 +64,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -589,3 +588,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
  ## Citation And Acknowledgment
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+ <p align="center">
+ <a href="#sglangtop" target="_blank">
+ <bold>Back To Top </bold>
+ </a>
+ </p>

{sglang-0.3.3 → sglang-0.3.3.post1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.3.3"
+ version = "0.3.3.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"

{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_latency.py
@@ -139,7 +139,7 @@ def load_model(server_args, port_args, tp_rank):
          gpu_id=tp_rank,
          tp_rank=tp_rank,
          tp_size=server_args.tp_size,
-         nccl_port=port_args.nccl_ports[0],
+         nccl_port=port_args.nccl_port,
          server_args=server_args,
      )
      rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -220,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
      return reqs


+ @torch.inference_mode()
  def extend(reqs, model_runner):
      batch = ScheduleBatch.init_new(
          reqs=reqs,
@@ -235,6 +236,7 @@ def extend(reqs, model_runner):
      return next_token_ids, logits_output.next_token_logits, batch


+ @torch.inference_mode()
  def decode(input_token_ids, batch, model_runner):
      batch.prepare_for_decode(input_token_ids)
      model_worker_batch = batch.get_model_worker_batch()
@@ -244,7 +246,6 @@ def decode(input_token_ids, batch, model_runner):
      return next_token_ids, logits_output.next_token_logits


- @torch.inference_mode()
  def correctness_test(
      server_args,
      port_args,
@@ -287,7 +288,6 @@ def correctness_test(
          rank_print(tokenizer.decode(output_ids[i]), "\n")


- @torch.inference_mode()
  def latency_test_run_once(
      run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
  ):
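
Note: the decorator moves above narrow the no-grad scope from the top-level test drivers down to the per-step extend() and decode() functions. A minimal standalone sketch of the decorator form of torch.inference_mode (toy tensors, not from the benchmark):

```python
import torch

@torch.inference_mode()
def decode_step(logits: torch.Tensor) -> torch.Tensor:
    # Inside inference_mode, tensors are created without autograd
    # tracking, which is slightly cheaper than torch.no_grad().
    return torch.argmax(logits, dim=-1)

next_tokens = decode_step(torch.randn(4, 32000))
assert not next_tokens.requires_grad
```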

{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/conversation.py
@@ -70,6 +70,9 @@ class Conversation:
      sep2: str = None
      # Stop criteria (the default one is EOS token)
      stop_str: Union[str, List[str]] = None
+     # The string that represents an image token in the prompt
+     image_token: str = "<image>"
+
      image_data: Optional[List[str]] = None
      modalities: Optional[List[str]] = None

@@ -334,6 +337,7 @@ class Conversation:
              sep=self.sep,
              sep2=self.sep2,
              stop_str=self.stop_str,
+             image_token=self.image_token,
          )

      def dict(self):
@@ -381,6 +385,7 @@ def generate_chat_conv(
          stop_str=conv.stop_str,
          image_data=[],
          modalities=[],
+         image_token=conv.image_token,
      )

      if isinstance(request.messages, str):
@@ -412,9 +417,13 @@ def generate_chat_conv(
                              num_image_url += 1
                              conv.modalities.append(content.modalities)
                      if num_image_url > 1:
-                         image_token = "<image>"
+                         image_token = conv.image_token
                      else:
-                         image_token = "<image>\n"
+                         image_token = (
+                             conv.image_token + "\n"
+                             if conv.name != "qwen2-vl"
+                             else conv.image_token
+                         )
                      for content in message.content:
                          if content.type == "text":
                              if num_image_url > 16:
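
Note: the last hunk replaces the hard-coded "<image>" placeholder with the template's own image_token, and special-cases qwen2-vl so no newline is appended after a single image token. A small illustration of that branch (template names and tokens are toy values, not the real template registry):

```python
def expand_image_token(template_name: str, image_token: str, num_image_url: int) -> str:
    # Mirrors the branch in generate_chat_conv: multiple images use the
    # bare token; a single image gets a trailing newline, except qwen2-vl.
    if num_image_url > 1:
        return image_token
    return image_token + "\n" if template_name != "qwen2-vl" else image_token

print(repr(expand_image_token("llava", "<image>", 1)))     # '<image>\n'
print(repr(expand_image_token("qwen2-vl", "<image>", 1)))  # '<image>'
```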

sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py (new file)
@@ -0,0 +1,177 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ """A controller that dispatches requests to multiple data parallel workers."""
+
+ import logging
+ import multiprocessing as mp
+ from enum import Enum, auto
+
+ import zmq
+
+ from sglang.srt.managers.io_struct import (
+     TokenizedEmbeddingReqInput,
+     TokenizedGenerateReqInput,
+     TokenizedRewardReqInput,
+ )
+ from sglang.srt.managers.scheduler import run_scheduler_process
+ from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.utils import (
+     configure_logger,
+     kill_parent_process,
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
+
+ logger = logging.getLogger(__name__)
+
+
+ class LoadBalanceMethod(Enum):
+     """Load balance method."""
+
+     ROUND_ROBIN = auto()
+     SHORTEST_QUEUE = auto()
+
+     @classmethod
+     def from_str(cls, method: str):
+         method = method.upper()
+         try:
+             return cls[method]
+         except KeyError as exc:
+             raise ValueError(f"Invalid load balance method: {method}") from exc
+
+
+ class DataParallelController:
+     """A controller that dispatches requests to multiple data parallel workers."""
+
+     def __init__(self, server_args, port_args) -> None:
+         # Parse args
+         self.server_args = server_args
+         self.port_args = port_args
+         self.load_balance_method = LoadBalanceMethod.from_str(
+             server_args.load_balance_method
+         )
+
+         # Init inter-process communication
+         self.context = zmq.Context(1 + server_args.dp_size)
+         self.recv_from_tokenizer = self.context.socket(zmq.PULL)
+         self.recv_from_tokenizer.bind(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+         # Dispatch method
+         self.round_robin_counter = 0
+         dispatch_lookup = {
+             LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
+             LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
+         }
+         self.dispatching = dispatch_lookup[self.load_balance_method]
+
+         # Start data parallel workers
+         base_gpu_id = 0
+         self.workers = []
+         for dp_rank in range(server_args.dp_size):
+             tmp_port_args = PortArgs.init_new(server_args)
+             tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
+
+             send_to = self.launch_tensor_parallel_group(
+                 server_args,
+                 tmp_port_args,
+                 base_gpu_id,
+                 dp_rank,
+             )
+
+             self.workers.append(send_to)
+             base_gpu_id += server_args.tp_size
+
+     def launch_tensor_parallel_group(
+         self,
+         server_args: ServerArgs,
+         port_args: PortArgs,
+         base_gpu_id: int,
+         dp_rank: int,
+     ):
+         # Launch tensor parallel scheduler processes
+         scheduler_procs = []
+         scheduler_pipe_readers = []
+         tp_size_per_node = server_args.tp_size // server_args.nnodes
+         tp_rank_range = range(
+             tp_size_per_node * server_args.node_rank,
+             tp_size_per_node * (server_args.node_rank + 1),
+         )
+         for tp_rank in tp_rank_range:
+             reader, writer = mp.Pipe(duplex=False)
+             gpu_id = base_gpu_id + tp_rank % tp_size_per_node
+             proc = mp.Process(
+                 target=run_scheduler_process,
+                 args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
+             )
+             proc.start()
+             scheduler_procs.append(proc)
+             scheduler_pipe_readers.append(reader)
+
+         send_to = self.context.socket(zmq.PUSH)
+         send_to.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+         # Wait for model to finish loading
+         for i in range(len(scheduler_pipe_readers)):
+             scheduler_pipe_readers[i].recv()
+
+         return send_to
+
+     def round_robin_scheduler(self, req):
+         self.workers[self.round_robin_counter].send_pyobj(req)
+         self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)
+
+     def shortest_queue_scheduler(self, input_requests):
+         raise NotImplementedError()
+
+     def event_loop(self):
+         while True:
+             while True:
+                 try:
+                     recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                 except zmq.ZMQError:
+                     break
+
+                 if isinstance(
+                     recv_req,
+                     (
+                         TokenizedGenerateReqInput,
+                         TokenizedEmbeddingReqInput,
+                         TokenizedRewardReqInput,
+                     ),
+                 ):
+                     self.dispatching(recv_req)
+                 else:
+                     # Send other control messages to all workers
+                     for worker in self.workers:
+                         worker.queue.put(recv_req)
+
+
+ def run_data_parallel_controller_process(
+     server_args: ServerArgs,
+     port_args: PortArgs,
+     pipe_writer,
+ ):
+     configure_logger(server_args)
+     suppress_other_loggers()
+
+     try:
+         controller = DataParallelController(server_args, port_args)
+         pipe_writer.send("ready")
+         controller.event_loop()
+     except Exception:
+         msg = get_exception_traceback()
+         logger.error(msg)
+         kill_parent_process()
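
Note: the new controller binds a single ZMQ PULL socket for tokenized requests and keeps one PUSH socket per data parallel replica; ROUND_ROBIN just cycles an index over that list. A stripped-down sketch of the same dispatch pattern (toy inproc endpoints and string payloads, no scheduler processes):

```python
import zmq

ctx = zmq.Context()
pulls, pushes = [], []
for i in range(2):
    addr = f"inproc://toy_worker_{i}"  # illustrative endpoints, not sglang's
    pull = ctx.socket(zmq.PULL)
    pull.bind(addr)                    # bind before connect, required for inproc
    push = ctx.socket(zmq.PUSH)
    push.connect(addr)
    pulls.append(pull)
    pushes.append(push)

# Round-robin dispatch, as in DataParallelController.round_robin_scheduler.
counter = 0
for req in ["req0", "req1", "req2", "req3"]:
    pushes[counter].send_pyobj(req)
    counter = (counter + 1) % len(pushes)

for i, pull in enumerate(pulls):
    print(f"worker {i}:", pull.recv_pyobj(), pull.recv_pyobj())
# worker 0: req0 req2
# worker 1: req1 req3
```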

{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py
@@ -20,6 +20,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).

  import uuid
  from dataclasses import dataclass
+ from enum import Enum
  from typing import Dict, List, Optional, Union

  from sglang.srt.managers.schedule_batch import BaseFinishReason
@@ -119,8 +120,7 @@ class GenerateReqInput:
              elif not isinstance(self.image_data, list):
                  self.image_data = [self.image_data] * num
              elif isinstance(self.image_data, list):
-                 # FIXME incorrect order for duplication
-                 self.image_data = self.image_data * num
+                 pass

              if self.sampling_params is None:
                  self.sampling_params = [{}] * num
@@ -344,3 +344,8 @@ class UpdateWeightReqOutput:
  class AbortReq:
      # The request id
      rid: str
+
+
+ class ProfileReq(Enum):
+     START_PROFILE = 1
+     STOP_PROFILE = 2
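
Note: the removed FIXME was about duplication order. Repeating a list with `* num` interleaves the entries rather than grouping the copies per original prompt, so after expansion an image could end up paired with the wrong prompt; post1 simply leaves an already-list image_data untouched. The ordering problem in two lines (toy data):

```python
images = ["img_a", "img_b"]
num = 2  # e.g. two parallel samples per prompt

print(images * num)
# ['img_a', 'img_b', 'img_a', 'img_b']  -- interleaved across prompts

print([img for img in images for _ in range(num)])
# ['img_a', 'img_a', 'img_b', 'img_b']  -- grouped per prompt
```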

{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py
@@ -423,6 +423,9 @@ class ScheduleBatch:
      # Stream
      has_stream: bool = False

+     # device
+     device: str = "cuda"
+
      # Has regex
      has_regex: bool = False

@@ -439,6 +442,7 @@ class ScheduleBatch:
              tree_cache=tree_cache,
              return_logprob=return_logprob,
              has_stream=has_stream,
+             device=req_to_token_pool.device,
              has_regex=has_regex,
          )

@@ -806,6 +810,8 @@ class ScheduleBatch:
              self.sampling_info.regex_fsm_states = [
                  req.regex_fsm_state for req in self.reqs
              ]
+         else:
+             self.sampling_info.regex_fsms = None

          return ModelWorkerBatch(
              forward_mode=self.forward_mode,

{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/scheduler.py
@@ -37,6 +37,7 @@ from sglang.srt.managers.io_struct import (
      BatchEmbeddingOut,
      BatchTokenIDOut,
      FlushCacheReq,
+     ProfileReq,
      TokenizedEmbeddingReqInput,
      TokenizedGenerateReqInput,
      TokenizedRewardReqInput,
@@ -141,7 +142,7 @@ class Scheduler:
              gpu_id=gpu_id,
              tp_rank=tp_rank,
              server_args=server_args,
-             nccl_port=port_args.nccl_ports[0],
+             nccl_port=port_args.nccl_port,
          )
          self.tp_cpu_group = self.tp_worker.model_runner.tp_group.cpu_group

@@ -229,6 +230,22 @@ class Scheduler:
          self.new_token_ratio_decay = global_config.new_token_ratio_decay
          self.batch_is_full = False

+         if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
+             self.profiler = None
+         else:
+             self.torch_profiler_trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+             logger.info(
+                 "Profiling enabled. Traces will be saved to: %s",
+                 self.torch_profiler_trace_dir,
+             )
+             self.profiler = torch.profiler.profile(
+                 activities=[
+                     torch.profiler.ProfilerActivity.CPU,
+                     torch.profiler.ProfilerActivity.CUDA,
+                 ],
+                 with_stack=True,
+             )
+
      @torch.inference_mode()
      def event_loop(self):
          while True:
@@ -271,6 +288,11 @@ class Scheduler:
              elif isinstance(recv_req, UpdateWeightReqInput):
                  success, message = self.update_weights(recv_req)
                  self.out_pyobjs.append(UpdateWeightReqOutput(success, message))
+             elif isinstance(recv_req, ProfileReq):
+                 if recv_req == ProfileReq.START_PROFILE:
+                     self.start_profile()
+                 else:
+                     self.stop_profile()
              else:
                  raise ValueError(f"Invalid request: {recv_req}")

@@ -433,6 +455,9 @@ class Scheduler:
              result = self.run_batch(batch)
              self.process_batch_result(batch, result)

+             if self.running_batch.is_empty():
+                 self.running_batch = None
+
              if self.running_batch is None:
                  break

@@ -772,9 +797,6 @@ class Scheduler:
          if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0:
              self.print_decode_stats()

-         if self.running_batch.is_empty():
-             self.running_batch = None
-
      def add_logprob_return_values(
          self,
          i: int,
@@ -1000,15 +1022,34 @@ class Scheduler:
              logger.error(message)
          return success, message

+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+         self.profiler.start()
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+         self.profiler.stop()
+         self.profiler.export_chrome_trace(
+             self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+         )
+         logger.info("Profiler is done")
+

  def run_scheduler_process(
      server_args: ServerArgs,
      port_args: PortArgs,
      gpu_id: int,
      tp_rank: int,
+     dp_rank: Optional[int],
      pipe_writer,
  ):
-     configure_logger(server_args, prefix=f" TP{tp_rank}")
+     if dp_rank is None:
+         configure_logger(server_args, prefix=f" TP{tp_rank}")
+     else:
+         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
+
      suppress_other_loggers()

      try:
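
Note: start_profile/stop_profile drive a single long-lived torch.profiler.profile object that is only constructed when SGLANG_TORCH_PROFILER_DIR is set. The same start/stop/export flow in isolation (standalone sketch; the directory and workload are placeholders):

```python
import os
import time

import torch

trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp/sglang_traces")
os.makedirs(trace_dir, exist_ok=True)

profiler = torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],  # the scheduler also adds CUDA
    with_stack=True,
)

profiler.start()
torch.randn(512, 512) @ torch.randn(512, 512)  # stand-in for scheduled forward passes
profiler.stop()

# Same naming scheme as Scheduler.stop_profile; .gz paths are written compressed.
profiler.export_chrome_trace(f"{trace_dir}/{time.time()}.trace.json.gz")
```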

{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py
@@ -46,6 +46,7 @@ from sglang.srt.managers.io_struct import (
      EmbeddingReqInput,
      FlushCacheReq,
      GenerateReqInput,
+     ProfileReq,
      RewardReqInput,
      TokenizedEmbeddingReqInput,
      TokenizedGenerateReqInput,
@@ -512,6 +513,14 @@ class TokenizerManager:
          req = AbortReq(rid)
          self.send_to_scheduler.send_pyobj(req)

+     def start_profile(self):
+         req = ProfileReq.START_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+     def stop_profile(self):
+         req = ProfileReq.STOP_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
      async def update_weights(
          self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
      ):
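
Note: with these two methods, anything holding the TokenizerManager can toggle the scheduler-side profiler by pushing a ProfileReq down the existing scheduler socket; the HTTP wiring presumably sits in the server.py changes (+55 −20), which are not shown in this section. A hedged usage sketch (assumes a live `tokenizer_manager` built at server startup):

```python
# Profiling must have been enabled via SGLANG_TORCH_PROFILER_DIR,
# otherwise the scheduler raises "Profiler is not enabled."
tokenizer_manager.start_profile()  # enqueues ProfileReq.START_PROFILE
# ... send some generation traffic to capture ...
tokenizer_manager.stop_profile()   # enqueues ProfileReq.STOP_PROFILE; the
# scheduler then exports a chrome trace into the configured directory.
```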