sglang 0.2.14.post1__tar.gz → 0.2.14.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.14.post1/sglang.egg-info → sglang-0.2.14.post2}/PKG-INFO +3 -3
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/README.md +2 -2
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/pyproject.toml +1 -1
- sglang-0.2.14.post2/sglang/launch_server_llavavid.py +26 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/hf_transformers_utils.py +0 -149
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/activation.py +10 -4
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/layernorm.py +47 -1
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/io_struct.py +5 -4
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/schedule_batch.py +5 -5
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/tokenizer_manager.py +74 -61
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/tp_worker.py +9 -10
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/model_executor/forward_batch_info.py +10 -20
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/model_executor/model_runner.py +15 -6
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/chatglm.py +1 -1
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/gemma.py +2 -2
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/gemma2.py +1 -51
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/grok.py +9 -3
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/llama2.py +3 -4
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/llama_classification.py +0 -4
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/llama_embedding.py +3 -4
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/llava.py +69 -91
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/llavavid.py +40 -86
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/qwen2.py +3 -4
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/qwen2_moe.py +7 -19
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/yivl.py +2 -7
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/server.py +3 -3
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/utils.py +18 -33
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/runners.py +1 -1
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/test_layernorm.py +53 -1
- sglang-0.2.14.post2/sglang/version.py +1 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2/sglang.egg-info}/PKG-INFO +3 -3
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang.egg-info/SOURCES.txt +1 -0
- sglang-0.2.14.post1/sglang/version.py +0 -1
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/LICENSE +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/setup.cfg +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/__init__.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/api.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/bench_latency.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/bench_serving.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/check_env.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/global_config.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/launch_server.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/decode_attention.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/prefill_attention.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/policy_scheduler.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/server_args.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/test/test_utils.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/utils.py +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.14.post1/sglang.egg-info → sglang-0.2.14.post2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14.post1
+Version: 0.2.14.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004

@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -496,7 +496,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
-  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
{sglang-0.2.14.post1 → sglang-0.2.14.post2}/README.md

@@ -56,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -240,7 +240,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
-  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
{sglang-0.2.14.post1 → sglang-0.2.14.post2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.2.14.post1"
+version = "0.2.14.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
sglang-0.2.14.post2/sglang/launch_server_llavavid.py (new file)

@@ -0,0 +1,26 @@
+"""Launch the inference server for Llava-video model."""
+
+import argparse
+
+from sglang.srt.server import ServerArgs, launch_server
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+
+    model_overide_args = {}
+    model_overide_args["mm_spatial_pool_stride"] = 2
+    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
+    model_overide_args["num_frames"] = 16
+    model_overide_args["model_type"] = "llavavid"
+    if model_overide_args["num_frames"] == 32:
+        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_overide_args["max_sequence_length"] = 4096 * 2
+        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
+        model_overide_args["model_max_length"] = 4096 * 2
+    if "34b" in args.model_path.lower():
+        model_overide_args["image_token_index"] = 64002
+
+    launch_server(server_args, model_overide_args, None)
{sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/hf_transformers_utils.py

@@ -119,24 +119,7 @@ def get_tokenizer(
     tokenizer_revision: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if tokenizer_name.endswith(".json"):
-        return TiktokenTokenizer(tokenizer_name)
-
-    if tokenizer_name.endswith(".model"):
-        return SentencePieceTokenizer(tokenizer_name)
-
     """Gets a tokenizer for the given model name via Huggingface."""
-    if is_multimodal_model(tokenizer_name):
-        processor = get_processor(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            tokenizer_revision=tokenizer_revision,
-            **kwargs,
-        )
-        tokenizer = processor.tokenizer
-        return tokenizer
-
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

@@ -199,135 +182,3 @@ def get_processor(
         **kwargs,
     )
     return processor
-
-
-class TiktokenTokenizer:
-    def __init__(self, tokenizer_path):
-        import tiktoken
-        from jinja2 import Template
-
-        PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-
-        # Read JSON
-        name = "tmp-json"
-        with open(tokenizer_path, "rb") as fin:
-            tok_dict = json.load(fin)
-
-        mergeable_ranks = {
-            bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"]
-        }
-        special_tokens = {
-            bytes(item["bytes"]).decode(): item["token"]
-            for item in tok_dict["special_tokens"]
-        }
-        assert tok_dict["word_split"] == "V1"
-
-        default_allowed_special = None
-
-        kwargs = {
-            "name": name,
-            "pat_str": tok_dict.get("pat_str", PAT_STR_B),
-            "mergeable_ranks": mergeable_ranks,
-            "special_tokens": special_tokens,
-        }
-        if "default_allowed_special" in tok_dict:
-            default_allowed_special = set(
-                [
-                    bytes(bytes_list).decode()
-                    for bytes_list in tok_dict["default_allowed_special"]
-                ]
-            )
-        if "vocab_size" in tok_dict:
-            kwargs["explicit_n_vocab"] = tok_dict["vocab_size"]
-
-        PAD = "<|pad|>"
-        EOS = "<|eos|>"
-        SEP = "<|separator|>"
-
-        DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
-
-        tokenizer = tiktoken.Encoding(**kwargs)
-        tokenizer._default_allowed_special = default_allowed_special or set()
-        tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS
-
-        def encode_patched(
-            self,
-            text: str,
-            *,
-            allowed_special: Union[
-                Literal["all"], AbstractSet[str]
-            ] = set(),  # noqa: B006
-            disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-        ) -> List[int]:
-            if isinstance(allowed_special, set):
-                allowed_special |= self._default_allowed_special
-            return tiktoken.Encoding.encode(
-                self,
-                text,
-                allowed_special=allowed_special,
-                disallowed_special=(),
-            )
-
-        tokenizer.encode = functools.partial(encode_patched, tokenizer)
-
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer._special_tokens[EOS]
-        self.vocab_size = tokenizer.n_vocab
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode_batch(batch)
-
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret
-
-
-class SentencePieceTokenizer:
-    def __init__(self, tokenizer_path):
-        import sentencepiece as spm
-        from jinja2 import Template
-
-        tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
-
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer.eos_id()
-        self.vocab_size = tokenizer.vocab_size()
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode(batch)
-
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret
{sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/activation.py

@@ -18,7 +18,7 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul
+from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,

@@ -43,18 +43,24 @@ class SiluAndMul(CustomOp):


 class GeluAndMul(CustomOp):
-    def __init__(self,
+    def __init__(self, approximate="tanh"):
         super().__init__()
+        self.approximate = approximate

     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        return F.gelu(x[..., :d], approximate=
+        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]

     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
         output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        gelu_tanh_and_mul(x, out)
+        if self.approximate == "tanh":
+            gelu_tanh_and_mul(x, out)
+        elif self.approximate == "none":
+            gelu_and_mul(x, out)
+        else:
+            raise RuntimeError("GeluAndMul only support tanh or none")
         return out

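The change above lets `GeluAndMul` choose between the exact and tanh-approximated GELU kernels. A minimal reference sketch of the computation it performs, mirroring the `forward_native` path in the diff, is shown below; the tensor shapes are arbitrary examples.

```python
# Reference computation for GeluAndMul: GELU on the first half of the last dimension,
# multiplied elementwise by the second half. "approximate" is "tanh" or "none",
# matching the new constructor argument.
import torch
import torch.nn.functional as F


def gelu_and_mul_reference(x: torch.Tensor, approximate: str = "tanh") -> torch.Tensor:
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]


x = torch.randn(2, 8)
print(gelu_and_mul_reference(x, "tanh").shape)  # torch.Size([2, 4])
print(gelu_and_mul_reference(x, "none").shape)  # torch.Size([2, 4])
```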
{sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/layers/layernorm.py

@@ -19,7 +19,12 @@ from typing import Optional, Tuple, Union

 import torch
 import torch.nn as nn
-from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from flashinfer.norm import (
+    fused_add_rmsnorm,
+    gemma_fused_add_rmsnorm,
+    gemma_rmsnorm,
+    rmsnorm,
+)
 from vllm.model_executor.custom_op import CustomOp


@@ -63,3 +68,44 @@ class RMSNorm(CustomOp):
             return x
         else:
             return x, residual
+
+
+class GemmaRMSNorm(CustomOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        if residual is not None:
+            x = x + residual
+            residual = x
+
+        x = x.float()
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x * (1.0 + self.weight.float())
+        x = x.to(orig_dtype)
+        return x if residual is None else (x, residual)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            gemma_fused_add_rmsnorm(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
+            return x, residual
+        out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
{sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/io_struct.py

@@ -55,6 +55,7 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
+
         if (
             isinstance(self.sampling_params, dict)
             and self.sampling_params.get("n", 1) != 1

@@ -161,10 +162,10 @@ class TokenizedGenerateReqInput:
     input_ids: List[int]
     # The pixel values for input images
     pixel_values: List[float]
-    # The hash of input images
-
-    # The image
-
+    # The hash values of input images
+    image_hashes: List[int]
+    # The image sizes
+    image_sizes: List[List[int]]
     # The sampling parameters
     sampling_params: SamplingParams
     # Whether to return the logprobs
{sglang-0.2.14.post1 → sglang-0.2.14.post2}/sglang/srt/managers/schedule_batch.py

@@ -121,8 +121,8 @@ class Req:

         # For vision input
         self.pixel_values = None
-        self.
-        self.
+        self.image_sizes = None
+        self.image_offsets = None
         self.pad_value = None

         # Prefix info

@@ -600,12 +600,12 @@ class ScheduleBatch:
             if req.pixel_values is not None:
                 (
                     req.origin_input_ids,
-                    req.
+                    req.image_offsets,
                 ) = model_runner.model.pad_input_ids(
                     req.origin_input_ids_unpadded,
                     req.pad_value,
-                    req.pixel_values
-                    req.
+                    req.pixel_values,
+                    req.image_sizes,
                 )

                 jump_forward_reqs.append(req)
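The call-site change above reflects the move from a single image per request to per-request lists of image hashes, sizes, and offsets. The stand-in below only illustrates the new call shape of `pad_input_ids` (lists in, padded ids plus one offset per image out); its body is a placeholder, not the sglang implementation.

```python
# Stand-in illustrating the new multi-image calling convention seen in the diff:
# pad_input_ids(origin_input_ids, pad_value, pixel_values, image_sizes)
#   -> (padded_input_ids, image_offsets)
# The padding logic here is a placeholder, not sglang's implementation.
from typing import Any, List, Tuple


def pad_input_ids(
    input_ids: List[int],
    pad_value: List[int],
    pixel_values: List[Any],       # one entry per input image
    image_sizes: List[List[int]],  # one [height, width] per input image
) -> Tuple[List[int], List[int]]:
    image_offsets = []
    padded = list(input_ids)
    for _ in image_sizes:
        image_offsets.append(len(padded))  # remember where this image's tokens start
        padded += pad_value                # reserve placeholder token ids for the image
    return padded, image_offsets


ids, offsets = pad_input_ids([1, 2, 3], [0, 0], [object(), object()], [[336, 336], [336, 336]])
print(ids, offsets)  # [1, 2, 3, 0, 0, 0, 0] [3, 5]
```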
|