sglang 0.3.4__tar.gz → 0.3.4.post2__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (156)
  1. {sglang-0.3.4/sglang.egg-info → sglang-0.3.4.post2}/PKG-INFO +17 -18
  2. {sglang-0.3.4 → sglang-0.3.4.post2}/README.md +15 -16
  3. {sglang-0.3.4 → sglang-0.3.4.post2}/pyproject.toml +30 -11
  4. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_latency.py +2 -1
  5. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/chat_template.py +17 -0
  6. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/launch_server_llavavid.py +1 -1
  7. sglang-0.3.4.post2/sglang/srt/configs/__init__.py +8 -0
  8. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/configs/model_config.py +27 -2
  9. sglang-0.3.4.post2/sglang/srt/configs/qwen2vl.py +133 -0
  10. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/fsm_cache.py +10 -3
  11. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/conversation.py +27 -0
  12. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/hf_transformers_utils.py +16 -1
  13. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/__init__.py +16 -5
  14. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
  15. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/flashinfer_backend.py +174 -54
  16. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_backend.py +22 -6
  17. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
  18. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/linear.py +89 -63
  19. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/logits_processor.py +5 -5
  20. sglang-0.3.4.post2/sglang/srt/layers/rotary_embedding.py +112 -0
  21. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/sampler.py +51 -39
  22. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/lora/lora.py +3 -1
  23. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/data_parallel_controller.py +1 -1
  24. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/detokenizer_manager.py +4 -0
  25. sglang-0.3.4.post2/sglang/srt/managers/image_processor.py +360 -0
  26. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/io_struct.py +10 -0
  27. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/schedule_batch.py +238 -68
  28. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/scheduler.py +69 -50
  29. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/tokenizer_manager.py +24 -4
  30. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/tp_worker.py +26 -111
  31. sglang-0.3.4.post2/sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
  32. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/memory_pool.py +56 -10
  33. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/radix_cache.py +4 -3
  34. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/model_executor/cuda_graph_runner.py +87 -28
  35. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/model_executor/forward_batch_info.py +83 -3
  36. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/model_executor/model_runner.py +32 -11
  37. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/chatglm.py +3 -3
  38. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/deepseek_v2.py +2 -2
  39. sglang-0.3.4.post2/sglang/srt/models/mllama.py +1004 -0
  40. sglang-0.3.4.post2/sglang/srt/models/qwen2_vl.py +724 -0
  41. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  42. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/sampling_batch_info.py +13 -3
  43. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/sampling_params.py +5 -7
  44. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/server.py +12 -0
  45. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/server_args.py +10 -0
  46. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/utils.py +22 -0
  47. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/run_eval.py +2 -0
  48. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/runners.py +20 -1
  49. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  50. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_utils.py +100 -3
  51. sglang-0.3.4.post2/sglang/version.py +1 -0
  52. {sglang-0.3.4 → sglang-0.3.4.post2/sglang.egg-info}/PKG-INFO +17 -18
  53. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/SOURCES.txt +5 -0
  54. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/requires.txt +1 -1
  55. sglang-0.3.4/sglang/srt/configs/__init__.py +0 -5
  56. sglang-0.3.4/sglang/srt/managers/image_processor.py +0 -187
  57. sglang-0.3.4/sglang/version.py +0 -1
  58. {sglang-0.3.4 → sglang-0.3.4.post2}/LICENSE +0 -0
  59. {sglang-0.3.4 → sglang-0.3.4.post2}/setup.cfg +0 -0
  60. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/__init__.py +0 -0
  61. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/api.py +0 -0
  62. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_server_latency.py +0 -0
  63. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_serving.py +0 -0
  64. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/check_env.py +0 -0
  65. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/global_config.py +0 -0
  66. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/__init__.py +0 -0
  67. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/__init__.py +0 -0
  68. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/anthropic.py +0 -0
  69. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/base_backend.py +0 -0
  70. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/litellm.py +0 -0
  71. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/openai.py +0 -0
  72. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  73. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/vertexai.py +0 -0
  74. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/choices.py +0 -0
  75. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/compiler.py +0 -0
  76. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/interpreter.py +0 -0
  77. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/ir.py +0 -0
  78. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/tracer.py +0 -0
  79. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/launch_server.py +0 -0
  80. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/configs/exaone.py +0 -0
  81. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/__init__.py +0 -0
  82. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/base_tool_cache.py +0 -0
  83. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/jump_forward.py +0 -0
  84. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/activation.py +0 -0
  85. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  86. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  87. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  88. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  89. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  90. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
  91. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/patch.py +0 -0
  92. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/layernorm.py +0 -0
  93. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/pooler.py +0 -0
  94. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
  95. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  96. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/radix_attention.py +0 -0
  97. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  98. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/lora/lora_config.py +0 -0
  99. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/lora/lora_manager.py +0 -0
  100. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  101. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  102. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  103. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  104. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mm_utils.py +0 -0
  105. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/baichuan.py +0 -0
  106. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/commandr.py +0 -0
  107. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/dbrx.py +0 -0
  108. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/deepseek.py +0 -0
  109. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/exaone.py +0 -0
  110. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/gemma.py +0 -0
  111. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/gemma2.py +0 -0
  112. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  113. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/grok.py +0 -0
  114. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/internlm2.py +0 -0
  115. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama.py +0 -0
  116. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama_classification.py +0 -0
  117. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama_embedding.py +0 -0
  118. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama_reward.py +0 -0
  119. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llava.py +0 -0
  120. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llavavid.py +0 -0
  121. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/minicpm.py +0 -0
  122. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/minicpm3.py +0 -0
  123. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/mistral.py +0 -0
  124. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/mixtral.py +0 -0
  125. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/mixtral_quant.py +0 -0
  126. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/olmo.py +0 -0
  127. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/olmoe.py +0 -0
  128. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/qwen.py +0 -0
  129. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/qwen2.py +0 -0
  130. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/qwen2_moe.py +0 -0
  131. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/stablelm.py +0 -0
  132. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  133. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/xverse.py +0 -0
  134. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/xverse_moe.py +0 -0
  135. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/yivl.py +0 -0
  136. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/openai_api/adapter.py +0 -0
  137. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/openai_api/protocol.py +0 -0
  138. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  139. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  140. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  141. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  142. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  143. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  144. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  145. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_common.py +0 -0
  146. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  147. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  148. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_math.py +0 -0
  149. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  150. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  151. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_activation.py +0 -0
  152. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_layernorm.py +0 -0
  153. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_programs.py +0 -0
  154. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/utils.py +0 -0
  155. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/dependency_links.txt +0 -0
  156. {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.4/sglang.egg-info → sglang-0.3.4.post2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -236,7 +236,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.5.5; extra == "srt"
+Requires-Dist: vllm==0.6.3.post1; extra == "srt"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: openai
@@ -284,17 +284,17 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
 --------------------------------------------------------------------------------

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps the CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",
@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for:
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -575,6 +575,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4

 **Embedding Models**

@@ -711,7 +712,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -892,7 +892,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
 <p align="center">
   <a href="#sglangtop" target="_blank">
     <bold>Back To Top </bold>
{sglang-0.3.4 → sglang-0.3.4.post2}/README.md
@@ -12,17 +12,17 @@
 --------------------------------------------------------------------------------

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -56,23 +56,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -226,7 +230,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps the CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -247,7 +252,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",
@@ -267,12 +271,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for:
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -280,7 +280,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -303,6 +303,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4

 **Embedding Models**

@@ -439,7 +440,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -620,7 +620,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
 <p align="center">
   <a href="#sglangtop" target="_blank">
     <bold>Back To Top </bold>
{sglang-0.3.4 → sglang-0.3.4.post2}/pyproject.toml
@@ -4,20 +4,16 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.3.4"
+version = "0.3.4.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
-license = {file = "LICENSE"}
+license = { file = "LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: Apache Software License",
 ]
-dependencies = [
-    "requests",
-    "tqdm",
-    "numpy",
-]
+dependencies = ["requests", "tqdm", "numpy"]

 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
@@ -26,13 +22,20 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
     "outlines>=0.0.44", "modelscope"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
-srt = ["sglang[runtime_common]", "torch", "vllm==0.5.5"]
+srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
 srt_xpu = ["sglang[runtime_common]"]

 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate", "peft"]
+test = [
+    "jsonlines",
+    "matplotlib",
+    "pandas",
+    "sentence_transformers",
+    "accelerate",
+    "peft",
+]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
@@ -43,7 +46,23 @@ dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"

 [tool.setuptools.packages.find]
-exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]

 [tool.wheel]
-exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_latency.py
@@ -227,8 +227,9 @@ def extend(reqs, model_runner):
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
+        model_config=model_runner.model_config,
     )
-    batch.prepare_for_extend(model_runner.model_config.vocab_size)
+    batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/chat_template.py
@@ -133,6 +133,22 @@ register_chat_template(
     )
 )

+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="qwen2-vl",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+

 register_chat_template(
     ChatTemplate(
@@ -213,6 +229,7 @@ register_chat_template(
             ),
         },
         stop_str=("<|eot_id|>",),
+        image_token="<|image|>",
     )
 )

{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/launch_server_llavavid.py
@@ -14,7 +14,7 @@ if __name__ == "__main__":
     model_override_args["num_frames"] = 16
     model_override_args["model_type"] = "llavavid"
     if model_override_args["num_frames"] == 32:
-        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
         model_override_args["max_sequence_length"] = 4096 * 2
         model_override_args["tokenizer_model_max_length"] = 4096 * 2
         model_override_args["model_max_length"] = 4096 * 2
sglang-0.3.4.post2/sglang/srt/configs/__init__.py
@@ -0,0 +1,8 @@
+from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
+
+__all__ = [
+    "ExaoneConfig",
+    "Qwen2VLConfig",
+    "Qwen2VLVisionConfig",
+]
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/configs/model_config.py
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import logging
+import os
 from enum import IntEnum, auto
 from typing import Optional

@@ -20,6 +22,8 @@ from transformers import PretrainedConfig

 from sglang.srt.hf_transformers_utils import get_config, get_context_length

+logger = logging.getLogger(__name__)
+

 class AttentionArch(IntEnum):
     MLA = auto()
@@ -46,10 +50,29 @@ class ModelConfig:
             model_override_args=model_override_args,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        derived_context_len = get_context_length(self.hf_text_config)
+        allow_long_context = os.environ.get(
+            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
+        )
+
         if context_length is not None:
-            self.context_len = context_length
+            if context_length > derived_context_len:
+                if allow_long_context:
+                    logger.warning(
+                        f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors."
+                    )
+                    self.context_len = context_length
+                else:
+                    raise ValueError(
+                        f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
+                        f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                    )
+            else:
+                self.context_len = context_length
         else:
-            self.context_len = get_context_length(self.hf_text_config)
+            self.context_len = derived_context_len

         # Unify the config keys for hf_text_config
         self.head_dim = getattr(
@@ -89,6 +112,8 @@ class ModelConfig:
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size

+        self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
sglang-0.3.4.post2/sglang/srt/configs/qwen2vl.py
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen2VL model configuration"""
+
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+
+
+class Qwen2VLVisionConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        depth=32,
+        embed_dim=1280,
+        hidden_size=3584,
+        hidden_act="quick_gelu",
+        mlp_ratio=4,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "qwen2_vl":
+            config_dict = config_dict["vision_config"]
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class Qwen2VLConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=8192,
+        intermediate_size=29568,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=80,
+        attention_dropout=0.0,
+        vision_config=None,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = Qwen2VLVisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = Qwen2VLVisionConfig()
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # NOTE: the following section from original transformers config
+        # for Qwen2-VL is commented out to address rope config loading issue
+        #
+        # if self.rope_scaling is not None and "type" in self.rope_scaling:
+        #     if self.rope_scaling["type"] == "mrope":
+        #         self.rope_scaling["type"] = "default"
+        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # rope_config_validation(self)
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
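For orientation, a small sketch of how these new config classes compose; the keyword values are just the defaults shown above, and the print results follow from them:

```python
from sglang.srt.configs import Qwen2VLConfig

# vision_config may be passed as a plain dict; Qwen2VLConfig wraps it in a
# Qwen2VLVisionConfig (or falls back to defaults when None).
config = Qwen2VLConfig(vision_config={"depth": 32, "embed_dim": 1280})
print(config.vision_config.patch_size)  # 14, the default patch size
print(config.rope_theta)                # 1000000.0
```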
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/fsm_cache.py
@@ -73,9 +73,16 @@ class FSMCache(BaseToolCache):
     def init_value(self, key):
         key_type, key_string = key
         if key_type == "json":
-            regex = build_regex_from_schema(
-                key_string, whitespace_pattern=self.constrained_json_whitespace_pattern
-            )
+            try:
+                regex = build_regex_from_schema(
+                    key_string,
+                    whitespace_pattern=self.constrained_json_whitespace_pattern,
+                )
+            except NotImplementedError as e:
+                logger.warning(
+                    f"skip invalid json schema: json_schema={key_string}, {e=}"
+                )
+                return None, key_string
         elif key_type == "regex":
             regex = key_string
         else:
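The effect of this change: a JSON schema that outlines cannot compile no longer crashes FSM construction; it is logged and skipped. A hypothetical probe of the same code path, assuming the outlines import path used by sglang at this version; the example schema is only a stand-in for whatever a given outlines release rejects:

```python
from outlines.fsm.json_schema import build_regex_from_schema

schema = '{"type": "string", "format": "uri"}'  # stand-in for an unsupported schema
try:
    regex = build_regex_from_schema(schema)
except NotImplementedError as e:
    # Mirrors the new fallback: warn and continue without a compiled FSM.
    print(f"skip invalid json schema: json_schema={schema}, {e=}")
```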
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/conversation.py
@@ -509,6 +509,19 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="llama_3_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+        image_token="<|image|>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="llava_llama_3",
@@ -530,3 +543,17 @@ register_conv_template(
         stop_str=["<|im_end|>", "<|action_end|>"],
     )
 )
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_conv_template(
+    Conversation(
+        name="qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
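Taken together with the chat_template.py change above, the newly registered "qwen2-vl" template can be selected at launch. A hypothetical invocation via the offline engine; the model path is an example, and `chat_template` is assumed to map to the server's existing `--chat-template` argument:

```python
import sglang as sgl

# The template name matches the one registered in conversation.py above.
llm = sgl.Engine(
    model_path="Qwen/Qwen2-VL-7B-Instruct",
    chat_template="qwen2-vl",
)
```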