sglang 0.3.4__tar.gz → 0.3.4.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.4/sglang.egg-info → sglang-0.3.4.post2}/PKG-INFO +17 -18
- {sglang-0.3.4 → sglang-0.3.4.post2}/README.md +15 -16
- {sglang-0.3.4 → sglang-0.3.4.post2}/pyproject.toml +30 -11
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_latency.py +2 -1
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/chat_template.py +17 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/launch_server_llavavid.py +1 -1
- sglang-0.3.4.post2/sglang/srt/configs/__init__.py +8 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/configs/model_config.py +27 -2
- sglang-0.3.4.post2/sglang/srt/configs/qwen2vl.py +133 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/fsm_cache.py +10 -3
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/conversation.py +27 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/hf_transformers_utils.py +16 -1
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/__init__.py +16 -5
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_backend.py +22 -6
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/linear.py +89 -63
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/logits_processor.py +5 -5
- sglang-0.3.4.post2/sglang/srt/layers/rotary_embedding.py +112 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/sampler.py +51 -39
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/lora/lora.py +3 -1
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/data_parallel_controller.py +1 -1
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang-0.3.4.post2/sglang/srt/managers/image_processor.py +360 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/io_struct.py +10 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/schedule_batch.py +238 -68
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/scheduler.py +69 -50
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/tokenizer_manager.py +24 -4
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/tp_worker.py +26 -111
- sglang-0.3.4.post2/sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/memory_pool.py +56 -10
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/radix_cache.py +4 -3
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/model_executor/forward_batch_info.py +83 -3
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/model_executor/model_runner.py +32 -11
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/chatglm.py +3 -3
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/deepseek_v2.py +2 -2
- sglang-0.3.4.post2/sglang/srt/models/mllama.py +1004 -0
- sglang-0.3.4.post2/sglang/srt/models/qwen2_vl.py +724 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/sampling_batch_info.py +13 -3
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/sampling_params.py +5 -7
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/server.py +12 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/server_args.py +10 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/utils.py +22 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/run_eval.py +2 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/runners.py +20 -1
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_utils.py +100 -3
- sglang-0.3.4.post2/sglang/version.py +1 -0
- {sglang-0.3.4 → sglang-0.3.4.post2/sglang.egg-info}/PKG-INFO +17 -18
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/SOURCES.txt +5 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/requires.txt +1 -1
- sglang-0.3.4/sglang/srt/configs/__init__.py +0 -5
- sglang-0.3.4/sglang/srt/managers/image_processor.py +0 -187
- sglang-0.3.4/sglang/version.py +0 -1
- {sglang-0.3.4 → sglang-0.3.4.post2}/LICENSE +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/setup.cfg +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/__init__.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/api.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_serving.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/check_env.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/global_config.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/launch_server.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llava.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang/utils.py +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.4 → sglang-0.3.4.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.4/sglang.egg-info → sglang-0.3.4.post2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004

@@ -236,7 +236,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.
+Requires-Dist: vllm==0.6.3.post1; extra == "srt"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: openai

@@ -284,17 +284,17 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
 --------------------------------------------------------------------------------

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).

@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.

@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",

@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)

@@ -575,6 +575,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4

 **Embedding Models**

@@ -711,7 +712,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -892,7 +892,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
 <p align="center">
 <a href="#sglangtop" target="_blank">
 <bold>Back To Top </bold>
{sglang-0.3.4 → sglang-0.3.4.post2}/README.md

@@ -12,17 +12,17 @@
 --------------------------------------------------------------------------------

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -56,23 +56,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).

@@ -226,7 +230,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.

@@ -247,7 +252,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",

@@ -267,12 +271,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -280,7 +280,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)

@@ -303,6 +303,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4

 **Embedding Models**

@@ -439,7 +440,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -620,7 +620,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
 <p align="center">
 <a href="#sglangtop" target="_blank">
 <bold>Back To Top </bold>
{sglang-0.3.4 → sglang-0.3.4.post2}/pyproject.toml

@@ -4,20 +4,16 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.3.4"
+version = "0.3.4.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
-license = {file = "LICENSE"}
+license = { file = "LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: Apache Software License",
 ]
-dependencies = [
-    "requests",
-    "tqdm",
-    "numpy",
-]
+dependencies = ["requests", "tqdm", "numpy"]

 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",

@@ -26,13 +22,20 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
     "outlines>=0.0.44", "modelscope"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
-srt = ["sglang[runtime_common]", "torch", "vllm==0.
+srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
 srt_xpu = ["sglang[runtime_common]"]

 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-test = [
+test = [
+    "jsonlines",
+    "matplotlib",
+    "pandas",
+    "sentence_transformers",
+    "accelerate",
+    "peft",
+]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]

@@ -43,7 +46,23 @@ dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"

 [tool.setuptools.packages.find]
-exclude = [
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]

 [tool.wheel]
-exclude = [
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/bench_latency.py

@@ -227,8 +227,9 @@ def extend(reqs, model_runner):
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
+        model_config=model_runner.model_config,
     )
-    batch.prepare_for_extend(
+    batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/lang/chat_template.py

@@ -133,6 +133,22 @@ register_chat_template(
     )
 )

+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="qwen2-vl",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>"),
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+

 register_chat_template(
     ChatTemplate(

@@ -213,6 +229,7 @@ register_chat_template(
             ),
         },
         stop_str=("<|eot_id|>",),
+        image_token="<|image|>",
     )
 )

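For orientation, the sketch below shows how a prompt would be assembled from the qwen2-vl template fields registered in the hunk above. It is illustrative only: the `render_prompt` helper and the sample messages are not part of the sglang package, and the constants simply restate the values from the diff.

```python
# Illustrative sketch: build a Qwen2-VL style prompt from the template fields
# registered above (role prefixes/suffixes, image token, stop string).
ROLE_PREFIX_AND_SUFFIX = {
    "system": ("<|im_start|>system\n", "<|im_end|>\n"),
    "user": ("<|im_start|>user\n", "<|im_end|>\n"),
    "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
}
IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"


def render_prompt(messages):
    """Concatenate (role, text) pairs using the qwen2-vl prefixes/suffixes."""
    parts = []
    for role, text in messages:
        prefix, suffix = ROLE_PREFIX_AND_SUFFIX[role]
        parts.append(prefix + text + suffix)
    # Leave the assistant turn open so the model continues from here.
    parts.append(ROLE_PREFIX_AND_SUFFIX["assistant"][0])
    return "".join(parts)


print(
    render_prompt(
        [
            ("system", "You are a helpful assistant."),
            ("user", IMAGE_TOKEN + "Describe this image."),
        ]
    )
)
```

Generation would then stop at the template's stop string, `<|im_end|>`.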
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/launch_server_llavavid.py

@@ -14,7 +14,7 @@ if __name__ == "__main__":
     model_override_args["num_frames"] = 16
     model_override_args["model_type"] = "llavavid"
     if model_override_args["num_frames"] == 32:
-        model_override_args["rope_scaling"] = {"factor": 2.0, "
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
         model_override_args["max_sequence_length"] = 4096 * 2
         model_override_args["tokenizer_model_max_length"] = 4096 * 2
         model_override_args["model_max_length"] = 4096 * 2
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/configs/model_config.py

@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import logging
+import os
 from enum import IntEnum, auto
 from typing import Optional

@@ -20,6 +22,8 @@ from transformers import PretrainedConfig

 from sglang.srt.hf_transformers_utils import get_config, get_context_length

+logger = logging.getLogger(__name__)
+

 class AttentionArch(IntEnum):
     MLA = auto()

@@ -46,10 +50,29 @@ class ModelConfig:
             model_override_args=model_override_args,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        derived_context_len = get_context_length(self.hf_text_config)
+        allow_long_context = os.environ.get(
+            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
+        )
+
         if context_length is not None:
-
+            if context_length > derived_context_len:
+                if allow_long_context:
+                    logger.warning(
+                        f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors."
+                    )
+                    self.context_len = context_length
+                else:
+                    raise ValueError(
+                        f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
+                        f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                    )
+            else:
+                self.context_len = context_length
         else:
-            self.context_len =
+            self.context_len = derived_context_len

         # Unify the config keys for hf_text_config
         self.head_dim = getattr(

@@ -89,6 +112,8 @@ class ModelConfig:
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size

+        self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
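The model_config.py changes above add a guard on user-specified context lengths. Below is a minimal standalone sketch of that decision logic, assuming it is pulled out into a free function for illustration (in the release itself the logic lives inline in ModelConfig.__init__ as shown):

```python
import logging
import os

logger = logging.getLogger(__name__)


def resolve_context_len(context_length, derived_context_len):
    """Sketch of the new ModelConfig behavior: refuse to exceed the derived
    context length unless SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN is set."""
    allow_long_context = os.environ.get(
        "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
    )
    if context_length is None:
        return derived_context_len
    if context_length <= derived_context_len:
        return context_length
    if allow_long_context:
        logger.warning(
            "User-specified context_length (%d) is greater than the derived "
            "context_length (%d); outputs may be incorrect.",
            context_length,
            derived_context_len,
        )
        return context_length
    raise ValueError(
        "context_length exceeds the derived maximum; set "
        "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 to override."
    )


print(resolve_context_len(None, 32768))   # falls back to the derived length
print(resolve_context_len(16384, 32768))  # smaller overrides are always allowed
```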
sglang-0.3.4.post2/sglang/srt/configs/qwen2vl.py (new file)

@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen2VL model configuration"""
+
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+
+
+class Qwen2VLVisionConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        depth=32,
+        embed_dim=1280,
+        hidden_size=3584,
+        hidden_act="quick_gelu",
+        mlp_ratio=4,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "qwen2_vl":
+            config_dict = config_dict["vision_config"]
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class Qwen2VLConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=8192,
+        intermediate_size=29568,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=80,
+        attention_dropout=0.0,
+        vision_config=None,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = Qwen2VLVisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = Qwen2VLVisionConfig()
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # NOTE: the following section from original transformers config
+        # for Qwen2-VL is commented out to address rope config loading issue
+        #
+        # if self.rope_scaling is not None and "type" in self.rope_scaling:
+        #     if self.rope_scaling["type"] == "mrope":
+        #         self.rope_scaling["type"] = "default"
+        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        #     rope_config_validation(self)
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
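The new qwen2vl.py module defines HF-style config classes with the defaults listed above. A quick usage sketch, assuming the import path matches the file location in this release and with explicitly passed values being arbitrary examples:

```python
from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig

# Vision-tower defaults come from the file above (depth=32, embed_dim=1280, patch_size=14, ...).
vision_cfg = Qwen2VLVisionConfig(num_heads=16)

# Qwen2VLConfig converts a dict into Qwen2VLVisionConfig and falls back to the
# defaults when vision_config is None, so pass a dict (e.g. via to_dict()).
cfg = Qwen2VLConfig(
    num_hidden_layers=80,
    num_attention_heads=64,
    vision_config=vision_cfg.to_dict(),
)
print(cfg.vision_config.patch_size)  # 14, the default above
print(cfg.rope_theta)                # 1000000.0, the default above
```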
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/constrained/fsm_cache.py

@@ -73,9 +73,16 @@ class FSMCache(BaseToolCache):
     def init_value(self, key):
         key_type, key_string = key
         if key_type == "json":
-
-
-
+            try:
+                regex = build_regex_from_schema(
+                    key_string,
+                    whitespace_pattern=self.constrained_json_whitespace_pattern,
+                )
+            except NotImplementedError as e:
+                logger.warning(
+                    f"skip invalid json schema: json_schema={key_string}, {e=}"
+                )
+                return None, key_string
         elif key_type == "regex":
             regex = key_string
         else:
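The fsm_cache.py hunk makes JSON-schema compilation failures non-fatal: a NotImplementedError from the schema-to-regex step is logged as a warning and (None, key_string) is returned instead of crashing the cache. The sketch below mirrors that control flow outside the class; build_regex_from_schema here is a stand-in stub, not the real outlines import, and the success path is simplified to return the regex directly.

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def build_regex_from_schema(schema: str, whitespace_pattern=None) -> str:
    """Stand-in for the schema-to-regex compiler sglang delegates to;
    here it always raises, to exercise the fallback path."""
    raise NotImplementedError("unsupported JSON schema feature")


def init_value(key_string: str, whitespace_pattern=None):
    # Mirrors the new behavior: invalid schemas are skipped, not fatal.
    try:
        regex = build_regex_from_schema(
            key_string, whitespace_pattern=whitespace_pattern
        )
    except NotImplementedError as e:
        logger.warning(f"skip invalid json schema: json_schema={key_string}, {e=}")
        return None, key_string
    return regex, key_string


print(init_value('{"$ref": "#"}'))  # -> (None, '{"$ref": "#"}')
```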
{sglang-0.3.4 → sglang-0.3.4.post2}/sglang/srt/conversation.py

@@ -509,6 +509,19 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="llama_3_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+        image_token="<|image|>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="llava_llama_3",

@@ -530,3 +543,17 @@ register_conv_template(
         stop_str=["<|im_end|>", "<|action_end|>"],
     )
 )
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_conv_template(
+    Conversation(
+        name="qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)