vllm_npu-0.4.2-py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/model_executor/guided_decoding/outlines_decoding.py
ADDED
@@ -0,0 +1,130 @@
import asyncio
import concurrent.futures
from copy import copy
from enum import Enum
from functools import lru_cache
from json import dumps as json_dumps
from re import escape as regex_escape
from typing import Tuple, Union

from pydantic import BaseModel
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionRequest)
from vllm.model_executor.guided_decoding.outlines_logits_processors import (
    CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)


class GuidedDecodingMode(Enum):
    JSON = "json"
    REGEX = "regex"
    CHOICE = "choice"
    GRAMMAR = "grammar"


# https://github.com/outlines-dev/outlines/blob/main/outlines/grammars/json.lark
# the main difference is that we changed the start: value to
# start: object | array, so we are denying scalar values as the root of the
# JSON. Starting with scalars as the root seems to cause llama to generate
# without stop.
JSON_GRAMMAR = r"""
?start: object | array

?value: object
| array
| UNESCAPED_STRING
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : UNESCAPED_STRING ":" value

%import common.UNESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""

global_thread_pool = None  # used for generating logits processor fsm


async def get_outlines_guided_decoding_logits_processor(
        request: Union[CompletionRequest, ChatCompletionRequest],
        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]:
    """
    Given an OpenAI-compatible request, check for guided decoding parameters
    and get the necessary logits processor for the given guide.
    We cache logit processors by (guide, tokenizer), and on cache hit
    we make a shallow copy to reuse the same underlying FSM.
    """
    global global_thread_pool
    guide, mode = _get_guide_and_mode(request)
    if not guide:
        return None

    if global_thread_pool is None:
        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=2)
    loop = asyncio.get_running_loop()

    result = await loop.run_in_executor(global_thread_pool,
                                        _get_cached_logits_processor, guide,
                                        tokenizer, mode,
                                        request.guided_whitespace_pattern)

    logits_processor = copy(result)
    # reset logits processor's internal state
    logits_processor.init_state()
    return logits_processor


def _get_guide_and_mode(
    request: Union[CompletionRequest, ChatCompletionRequest]
) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:

    if request.guided_json:
        json = request.guided_json
        if isinstance(json, dict):
            # turn dict into hashable string
            json = json_dumps(json)
        elif isinstance(json, BaseModel):
            # use pydantic signature so that different model classes
            # with the same fields will get hashed the same
            json = str(json.__signature__)
        return json, GuidedDecodingMode.JSON
    elif request.guided_regex:
        return request.guided_regex, GuidedDecodingMode.REGEX
    elif request.guided_choice:
        # choice just uses regex
        choices = [
            regex_escape(str(choice)) for choice in request.guided_choice
        ]
        choices_regex = "(" + "|".join(choices) + ")"
        return choices_regex, GuidedDecodingMode.CHOICE
    elif request.guided_grammar:
        return request.guided_grammar, GuidedDecodingMode.GRAMMAR
    elif (request.response_format is not None
          and request.response_format.type == "json_object"):
        return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
    else:
        return None, None


@lru_cache(maxsize=32)
def _get_cached_logits_processor(guide: str,
                                 tokenizer: PreTrainedTokenizerBase,
                                 mode: GuidedDecodingMode,
                                 whitespace_pattern: Union[str, None]):
    if mode == GuidedDecodingMode.JSON:
        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
    elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
        return RegexLogitsProcessor(guide, tokenizer)
    elif mode == GuidedDecodingMode.GRAMMAR:
        return CFGLogitsProcessor(guide, tokenizer)
    else:
        raise ValueError(f"Unknown guided decoding mode {mode}")
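Editor's note: a minimal usage sketch for the entry point above, awaited from a running event loop. The tokenizer name and the non-guided CompletionRequest fields are illustrative assumptions, not taken from this diff.

import asyncio

from transformers import AutoTokenizer

from vllm.entrypoints.openai.protocol import CompletionRequest
from vllm.model_executor.guided_decoding.outlines_decoding import (
    get_outlines_guided_decoding_logits_processor)


async def main():
    # Illustrative model/prompt; any HF tokenizer works for FSM compilation.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    request = CompletionRequest(model="gpt2",
                                prompt="Is the sky blue? ",
                                guided_regex=r"(yes|no)")
    # FSM compilation runs in the module-level thread pool; the cached
    # processor is shallow-copied and its per-sequence state reset.
    processor = await get_outlines_guided_decoding_logits_processor(
        request, tokenizer)
    print(type(processor).__name__)  # RegexLogitsProcessor


asyncio.run(main())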
vllm/model_executor/guided_decoding/outlines_logits_processors.py
ADDED
@@ -0,0 +1,184 @@
# Copyright 2024- the Outlines developers
# This file is adapted from
# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import math
from collections import defaultdict
from functools import lru_cache
from typing import Callable, DefaultDict, Dict, List, Union

import torch
from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
from outlines.fsm.json_schema import build_regex_from_schema
from pydantic import BaseModel
from transformers import PreTrainedTokenizerBase


class BaseLogitsProcessor:

    def __init__(self):
        # Child class should use initialize in their init.
        self.fsm: FSM

    def init_state(self):
        """Initialize the FSM states."""
        self.fsm_state: DefaultDict[int, int] = defaultdict(int)

    def __call__(self, input_ids: List[int],
                 scores: torch.Tensor) -> torch.Tensor:
        """Use the FSM to bias the logits before sampling the next token."""
        seq_id = hash(tuple(input_ids))

        if len(input_ids) == 0:
            self.init_state()
        else:
            last_token = input_ids[-1]
            last_seq_id = hash(tuple(input_ids[:-1]))
            self.fsm_state[seq_id] = self.fsm.next_state(
                self.fsm_state[last_seq_id], last_token)

        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])

        mask = torch.full((scores.shape[-1], ),
                          -math.inf,
                          device=scores.device)
        mask[allowed_tokens] = 0
        scores.add_(mask)
        return scores


class RegexLogitsProcessor(BaseLogitsProcessor):

    def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
        """Compile the FSM that drives the regex-structured generation.

        Parameters
        ----------
        regex_string
            A string that represents a regular expression
        tokenizer
            The model's tokenizer

        """
        tokenizer = _adapt_tokenizer(tokenizer)
        fsm = RegexFSM(regex_string, tokenizer)
        self.fsm = fsm


class JSONLogitsProcessor(RegexLogitsProcessor):

    def __init__(self, schema: Union[str, Dict, BaseModel],
                 tokenizer: PreTrainedTokenizerBase,
                 whitespace_pattern: Union[str, None]):
        """Compile the FSM that drives the JSON-guided generation.

        Parameters
        ----------
        schema
            A JSON schema that encodes the structure we want the model to
            generate
        tokenizer
            The model's tokenizer
        whitespace_pattern
            Pattern to use for JSON syntactic whitespace (doesn't impact
            string literals)
            Example: allow only a single space or newline with
            `whitespace_pattern=r"[\n ]?"`
        """
        if isinstance(schema, type(BaseModel)):
            schema_str = json.dumps(schema.model_json_schema())
        elif isinstance(schema, Dict):
            schema_str = json.dumps(schema)
        elif isinstance(schema, str):
            schema_str = schema
        else:
            raise ValueError(
                f"Cannot parse schema {schema}. The schema must be either "
                f"a Pydantic object, a dictionary or a string that contains "
                f"the JSON Schema specification")
        regex_string = build_regex_from_schema(schema_str, whitespace_pattern)
        super().__init__(regex_string, tokenizer)


class CFGLogitsProcessor(BaseLogitsProcessor):

    def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
        """Compile the FSM that drives the context free grammar generation.

        Parameters
        ----------
        cfg
            A string that represents a context-free grammar
        tokenizer
            The model's tokenizer

        """
        tokenizer = _adapt_tokenizer(tokenizer)
        fsm = CFGFSM(cfg, tokenizer)
        self.fsm = fsm

    def init_state(self):
        """Initialize state with a CFGFSM copy."""
        super().init_state()
        self.fsm = self.fsm.copy()


@lru_cache
def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
    """Adapt vLLM's tokenizer to use to compile the FSM.

    The API of Outlines tokenizers is slightly different to that of
    `transformers`. The decoder of outlines, returns a list whereas
    the decode of vLLM returns an str. To sync the vLLM decoder with
    outlines internal api, the decoder should be adapted. In addition
    we need to handle the missing spaces to Llama's tokenizer to be
    able to compile FSMs for this model.

    """
    if getattr(tokenizer, "_outlines_adapted", False):
        return tokenizer

    tokenizer = copy.deepcopy(tokenizer)

    tokenizer.vocabulary = tokenizer.get_vocab()
    tokenizer.special_tokens = set(tokenizer.all_special_tokens)

    def convert_token_to_string(token: str) -> str:
        from transformers.file_utils import SPIECE_UNDERLINE

        string = tokenizer.convert_tokens_to_string([token])

        # A hack to handle missing spaces to HF's Llama tokenizers
        if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
            return " " + string

        return string

    def change_decoder(
        decoder: Callable[[List[int]],
                          str]) -> Callable[[List[int]], List[str]]:
        """Sync vLLM's decoder with the outlines by returning list."""

        def new_decoder(inp_tokens: List[int]) -> List[str]:
            return [decoder(inp_tokens)]

        return new_decoder

    tokenizer.convert_token_to_string = convert_token_to_string
    tokenizer.decode = change_decoder(tokenizer.decode)
    setattr(tokenizer, "_outlines_adapted", True)  # noqa: B010

    return tokenizer
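Editor's note: the heart of BaseLogitsProcessor.__call__ is the masking step, where every token id outside the FSM's allowed set gets -inf added to its logit so sampling can only follow the guide. A self-contained toy illustration; the AllowFirstTwo stub stands in for an outlines FSM and is not part of this diff.

import math
from typing import List

import torch


class AllowFirstTwo:
    """Stub FSM: only token ids 0 and 1 are allowed in every state."""

    def allowed_token_ids(self, state: int) -> List[int]:
        return [0, 1]


fsm = AllowFirstTwo()
scores = torch.zeros(5)  # logits over a toy 5-token vocabulary
mask = torch.full((scores.shape[-1], ), -math.inf)
mask[fsm.allowed_token_ids(0)] = 0
scores.add_(mask)  # in-place, exactly as in __call__
print(scores)  # tensor([0., 0., -inf, -inf, -inf])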
vllm/model_executor/layers/__init__.py
File without changes
vllm/model_executor/layers/activation.py
ADDED
@@ -0,0 +1,173 @@
"""Custom activation functions."""
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from vllm import _custom_ops as ops
from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.utils import set_weight_attrs


class SiluAndMul(nn.Module):
    """An activation function for SwiGLU.

    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.silu_and_mul(out, x)
        return out


class GeluAndMul(nn.Module):
    """An activation function for GeGLU.

    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    def __init__(self, approximate: str = "none"):
        super().__init__()
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        if self.approximate == "none":
            ops.gelu_and_mul(out, x)
        elif self.approximate == "tanh":
            ops.gelu_tanh_and_mul(out, x)
        return out

    def extra_repr(self) -> str:
        return f'approximate={repr(self.approximate)}'


class NewGELU(nn.Module):

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        c = math.sqrt(2.0 / math.pi)
        return 0.5 * x * (1.0 + torch.tanh(c *
                                           (x + 0.044715 * torch.pow(x, 3.0))))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        ops.gelu_new(out, x)
        return out


class FastGELU(nn.Module):

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                           (1.0 + 0.044715 * x * x)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        ops.gelu_fast(out, x)
        return out


class ScaledActivation(nn.Module):
    """An activation function with post-scale parameters.

    This is used for some quantization methods like AWQ.
    """

    def __init__(
        self,
        act_module: nn.Module,
        intermediate_size: int,
        input_is_parallel: bool = True,
        params_dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
        self.act = act_module
        self.input_is_parallel = input_is_parallel
        if input_is_parallel:
            tp_size = get_tensor_model_parallel_world_size()
            intermediate_size_per_partition = divide(intermediate_size,
                                                     tp_size)
        else:
            intermediate_size_per_partition = intermediate_size
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.scales = nn.Parameter(
            torch.empty(intermediate_size_per_partition, dtype=params_dtype))
        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(x) / self.scales

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
        param_data = param.data
        if self.input_is_parallel:
            tp_rank = get_tensor_model_parallel_rank()
            shard_size = param_data.shape[0]
            start_idx = tp_rank * shard_size
            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)


_ACTIVATION_REGISTRY = {
    "gelu": nn.GELU(),
    "gelu_fast": FastGELU(),
    "gelu_new": NewGELU(),
    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
    "relu": nn.ReLU(),
}


def get_act_fn(
    act_fn_name: str,
    quant_config: Optional[QuantizationConfig] = None,
    intermediate_size: Optional[int] = None,
    input_is_parallel: bool = True,
    params_dtype: Optional[torch.dtype] = None,
) -> nn.Module:
    """Get an activation function by name."""
    act_fn_name = act_fn_name.lower()
    if act_fn_name not in _ACTIVATION_REGISTRY:
        raise ValueError(
            f"Activation function {act_fn_name!r} is not supported.")

    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
    if (quant_config is not None
            and act_fn_name in quant_config.get_scaled_act_names()):
        if intermediate_size is None:
            raise ValueError("intermediate_size must be specified for scaled "
                             "activation functions.")
        return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
                                params_dtype)
    return act_fn
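Editor's note: a quick check of the SwiGLU contract documented on SiluAndMul, assuming the wheel is importable. _forward is the PyTorch-native reference path; forward() dispatches to the compiled ops.silu_and_mul kernel, which this sketch avoids.

import torch
import torch.nn.functional as F

from vllm.model_executor.layers.activation import SiluAndMul

x = torch.randn(4, 16)          # (num_tokens, 2 * d) with d = 8
out = SiluAndMul()._forward(x)  # native path; no custom kernel needed
assert out.shape == (4, 8)
assert torch.allclose(out, F.silu(x[..., :8]) * x[..., 8:])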
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
ADDED
@@ -0,0 +1,146 @@
{
    "1": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "2": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 16,
        "num_warps": 4,
        "num_stages": 4
    },
    "4": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 4,
        "num_stages": 4
    },
    "8": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 64,
        "num_warps": 4,
        "num_stages": 4
    },
    "16": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 4
    },
    "24": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "32": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "48": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "64": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "96": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 64,
        "num_warps": 4,
        "num_stages": 4
    },
    "128": {
        "BLOCK_SIZE_M": 32,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "256": {
        "BLOCK_SIZE_M": 32,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 32,
        "num_warps": 4,
        "num_stages": 4
    },
    "512": {
        "BLOCK_SIZE_M": 32,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 64,
        "num_warps": 4,
        "num_stages": 4
    },
    "1024": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    },
    "1536": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    },
    "2048": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    },
    "3072": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    },
    "4096": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    }
}
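Editor's note: the outer keys of these fused-MoE configs are token counts (the M dimension of the expert GEMMs), and each entry carries Triton tile and launch parameters tuned for that batch size on the named device. A standalone sketch of the nearest-key lookup that fused_moe.py performs over such a table; the file path is illustrative and assumes the current directory.

import json

# Inside the wheel this file lives under
# vllm/model_executor/layers/fused_moe/configs/.
with open("E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json") as f:
    configs = {int(k): v for k, v in json.load(f).items()}

M = 700  # tokens in the current batch
best = configs[min(configs, key=lambda k: abs(k - M))]  # nearest key: 512
print(best["BLOCK_SIZE_M"], best["num_warps"])  # 32 4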