vllm_npu-0.4.2-py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
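
The manifest above shows an NPU-targeted repackaging of the vLLM 0.4.2 engine: scheduler and block manager (`vllm/core`), executors and workers, model implementations, and the offline/OpenAI entrypoints. As a quick orientation to what these modules add up to, here is a minimal offline-inference sketch. It assumes the NPU build keeps upstream vLLM 0.4.2's `LLM`/`SamplingParams` API from `vllm/entrypoints/llm.py`; the model name is only a placeholder, not something shipped with this wheel.

```python
# Minimal offline-inference sketch, assuming this build keeps upstream
# vLLM 0.4.2's Python API (vllm/entrypoints/llm.py in the listing above).
# The model id below is a placeholder.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

llm = LLM(model="facebook/opt-125m")  # any architecture known to ModelRegistry
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    # Each RequestOutput carries the prompt and one completion per sample.
    print(output.prompt, "->", output.outputs[0].text)
```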
vllm/model_executor/model_loader/weight_utils.py (new file, 372 lines):

```python
"""Utilities for downloading and initializing model weights."""
import fnmatch
import glob
import hashlib
import json
import os
import tempfile
from collections import defaultdict
from typing import Any, Generator, Iterable, List, Optional, Tuple

import filelock
import huggingface_hub.constants
import numpy as np
import torch
from huggingface_hub import HfFileSystem, snapshot_download
from safetensors.torch import load_file, safe_open, save_file
from tqdm.auto import tqdm

from vllm.config import LoadConfig, ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (QuantizationConfig,
                                                     get_quantization_config)
from vllm.model_executor.layers.quantization.schema import QuantParamSchema

logger = init_logger(__name__)

# use system-level temp directory for file locks, so that multiple users
# can share the same lock without error.
# lock files in the temp directory will be automatically deleted when the
# system reboots, so users will not complain about annoying lock files
temp_dir = tempfile.gettempdir()


def enable_hf_transfer():
    """automatically activates hf_transfer
    """
    if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
        try:
            # enable hf hub transfer if available
            import hf_transfer  # type: ignore # noqa
            huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
        except ImportError:
            pass


enable_hf_transfer()


class DisabledTqdm(tqdm):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, disable=True)


def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
    lock_dir = cache_dir or temp_dir
    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
    model_name = model_name_or_path.replace("/", "-")
    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
    # add hash to avoid conflict with old users' lock files
    lock_file_name = hash_name + model_name + ".lock"
    # mode 0o666 is required for the filelock to be shared across users
    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
                             mode=0o666)
    return lock


def _shared_pointers(tensors):
    ptrs = defaultdict(list)
    for k, v in tensors.items():
        ptrs[v.data_ptr()].append(k)
    failing = []
    for _, names in ptrs.items():
        if len(names) > 1:
            failing.append(names)
    return failing


def convert_bin_to_safetensor_file(
    pt_filename: str,
    sf_filename: str,
) -> None:
    loaded = torch.load(pt_filename, map_location="cpu")
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]
    shared = _shared_pointers(loaded)
    for shared_weights in shared:
        for name in shared_weights[1:]:
            loaded.pop(name)

    # For tensors to be contiguous
    loaded = {k: v.contiguous() for k, v in loaded.items()}

    dirname = os.path.dirname(sf_filename)
    os.makedirs(dirname, exist_ok=True)
    save_file(loaded, sf_filename, metadata={"format": "pt"})

    # check file size
    sf_size = os.stat(sf_filename).st_size
    pt_size = os.stat(pt_filename).st_size
    if (sf_size - pt_size) / pt_size > 0.01:
        raise RuntimeError(f"""The file size difference is more than 1%:
         - {sf_filename}: {sf_size}
         - {pt_filename}: {pt_size}
         """)

    # check if the tensors are the same
    reloaded = load_file(sf_filename)
    for k in loaded:
        pt_tensor = loaded[k]
        sf_tensor = reloaded[k]
        if not torch.equal(pt_tensor, sf_tensor):
            raise RuntimeError(f"The output tensors do not match for key {k}")


# TODO(woosuk): Move this to other place.
def get_quant_config(model_config: ModelConfig,
                     load_config: LoadConfig) -> QuantizationConfig:
    quant_cls = get_quantization_config(model_config.quantization)
    # Read the quantization config from the HF model config, if available.
    hf_quant_config = getattr(model_config.hf_config, "quantization_config",
                              None)
    if hf_quant_config is not None:
        return quant_cls.from_config(hf_quant_config)
    model_name_or_path = model_config.model
    is_local = os.path.isdir(model_name_or_path)
    if not is_local:
        # Download the config files.
        with get_lock(model_name_or_path, load_config.download_dir):
            hf_folder = snapshot_download(
                model_name_or_path,
                revision=model_config.revision,
                allow_patterns="*.json",
                cache_dir=load_config.download_dir,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                tqdm_class=DisabledTqdm,
            )
    else:
        hf_folder = model_name_or_path

    possible_config_filenames = quant_cls.get_config_filenames()

    # If the quantization config is not found, use the default config.
    if not possible_config_filenames:
        return quant_cls()

    config_files = glob.glob(os.path.join(hf_folder, "*.json"))

    quant_config_files = [
        f for f in config_files if any(
            f.endswith(x) for x in possible_config_filenames)
    ]
    if len(quant_config_files) == 0:
        raise ValueError(
            f"Cannot find the config file for {model_config.quantization}")
    if len(quant_config_files) > 1:
        raise ValueError(
            f"Found multiple config files for {model_config.quantization}: "
            f"{quant_config_files}")

    quant_config_file = quant_config_files[0]
    with open(quant_config_file, "r") as f:
        config = json.load(f)
    return quant_cls.from_config(config)


def download_weights_from_hf(
    model_name_or_path: str,
    cache_dir: Optional[str],
    allow_patterns: List[str],
    revision: Optional[str] = None,
) -> str:
    """Download model weights from Hugging Face Hub.

    Args:
        model_name_or_path (str): The model name or path.
        cache_dir (Optional[str]): The cache directory to store the model
            weights. If None, will use HF defaults.
        allow_patterns (List[str]): The allowed patterns for the
            weight files. Files matched by any of the patterns will be
            downloaded.
        revision (Optional[str]): The revision of the model.

    Returns:
        str: The path to the downloaded model weights.
    """
    if not huggingface_hub.constants.HF_HUB_OFFLINE:
        # Before we download we look at what is available:
        fs = HfFileSystem()
        file_list = fs.ls(model_name_or_path, detail=False, revision=revision)

        # depending on what is available we download different things
        for pattern in allow_patterns:
            matching = fnmatch.filter(file_list, pattern)
            if len(matching) > 0:
                allow_patterns = [pattern]
                break

    logger.info("Using model weights format %s", allow_patterns)
    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(model_name_or_path, cache_dir):
        hf_folder = snapshot_download(
            model_name_or_path,
            allow_patterns=allow_patterns,
            cache_dir=cache_dir,
            tqdm_class=DisabledTqdm,
            revision=revision,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
        )
    return hf_folder


def filter_files_not_needed_for_inference(
        hf_weights_files: List[str]) -> List[str]:
    """
    Exclude files that are not needed for inference.

    See https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
    """
    blacklist = [
        "training_args.bin",
        "optimizer.bin",
        "optimizer.pt",
        "scheduler.pt",
        "scaler.pt",
    ]
    hf_weights_files = [
        f for f in hf_weights_files
        if not any(f.endswith(x) for x in blacklist)
    ]
    return hf_weights_files


def np_cache_weights_iterator(
    model_name_or_path: str, cache_dir: Optional[str], hf_folder: str,
    hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model np files.

    Will dump the model weights to numpy files if they are not already dumped.
    """
    # Convert the model weights from torch tensors to numpy arrays for
    # faster loading.
    np_folder = os.path.join(hf_folder, "np")
    os.makedirs(np_folder, exist_ok=True)
    weight_names_file = os.path.join(np_folder, "weight_names.json")
    # Use file lock to prevent multiple processes from
    # dumping the same model weights to numpy at the same time.
    with get_lock(model_name_or_path, cache_dir):
        if not os.path.exists(weight_names_file):
            weight_names = []
            for bin_file in hf_weights_files:
                state = torch.load(bin_file, map_location="cpu")
                for name, param in state.items():
                    param_path = os.path.join(np_folder, name)
                    with open(param_path, "wb") as f:
                        np.save(f, param.cpu().detach().numpy())
                    weight_names.append(name)
            with open(weight_names_file, "w") as f:
                json.dump(weight_names, f)

    with open(weight_names_file, "r") as f:
        weight_names = json.load(f)

    for name in weight_names:
        param_path = os.path.join(np_folder, name)
        with open(param_path, "rb") as f:
            param = np.load(f)
        yield name, torch.from_numpy(param)


def safetensors_weights_iterator(
    hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model safetensor files."""
    for st_file in hf_weights_files:
        with safe_open(st_file, framework="pt") as f:
            for name in f.keys():  # noqa: SIM118
                param = f.get_tensor(name)
                yield name, param


def pt_weights_iterator(
    hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model bin/pt files."""
    for bin_file in hf_weights_files:
        state = torch.load(bin_file, map_location="cpu")
        for name, param in state.items():
            yield name, param
        del state
        torch.cuda.empty_cache()


def kv_cache_scales_loader(
        filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int,
        model_type: Optional[str]) -> Iterable[Tuple[int, float]]:
    """
    A simple utility to read in KV cache scaling factors that have been
    previously serialized to disk. Used by the model to populate the appropriate
    KV cache scaling factors. The serialization should represent a dictionary
    whose keys are the TP ranks and values are another dictionary mapping layers
    to their KV cache scaling factors.
    Keep this function in sync with the output of examples/fp8/extract_scales.py
    """
    try:
        with open(filename) as f:
            context = {
                "model_type": model_type,
                "num_hidden_layers": num_hidden_layers,
                "tp_rank": tp_rank,
                "tp_size": tp_size,
            }
            schema_dct = json.load(f)
            schema = QuantParamSchema.model_validate(schema_dct,
                                                     context=context)
            layer_scales_map = schema.kv_cache.scaling_factor[tp_rank]
            return layer_scales_map.items()

    except FileNotFoundError:
        logger.error("File or directory '%s' not found.", filename)
    except json.JSONDecodeError:
        logger.error("Error decoding JSON in file '%s'.", filename)
    except Exception as e:
        logger.error("An error occurred while reading '%s': %s", filename, e)
    # This section is reached if and only if any of the excepts are hit
    # Return an empty iterable (list) => no KV cache scales are loaded
    # which ultimately defaults to 1.0 scales
    logger.warning(
        "Defaulting to KV cache scaling factors = 1.0 for all "
        "layers in TP rank %d as an error occurred during loading.", tp_rank)
    return []


def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
    """convert PySafeSlice object from safetensors to torch.Tensor

    PySafeSlice object supports indexing, which is done before loading the
    actual tensor and can reduce the amount of memory being read into
    memory. However, it does not support more advanced functionalities
    like `.view()` or `.t()`. Therefore, if we need to modify the loaded
    tensor with these more complicated operators, we need to convert to
    tensor first.
    """
    if not isinstance(x, torch.Tensor):
        x = x[:]
    return x


def default_weight_loader(param: torch.Tensor,
                          loaded_weight: torch.Tensor) -> None:
    """Default weight loader."""
    assert param.size() == loaded_weight.size()
    param.data.copy_(loaded_weight)


def initialize_dummy_weights(
    model: torch.nn.Module,
    low: float = -1e-3,
    high: float = 1e-3,
) -> None:
    """Initialize model weights with random values.

    The model weights must be randomly initialized for accurate performance
    measurements. Additionally, the model weights should not cause NaNs in the
    forward pass. We empirically found that initializing the weights with
    values between -1e-3 and 1e-3 works well for most models.
    """
    for param in model.state_dict().values():
        if torch.is_floating_point(param):
            param.data.uniform_(low, high)
```
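
The helpers above form the model loader's download-and-iterate path: `download_weights_from_hf` narrows the allow patterns to whichever format the repo actually provides, and the iterator functions stream `(name, tensor)` pairs into `default_weight_loader`. A hedged sketch of how they compose, using only the signatures shown above; the repo id and patterns are illustrative, not part of the wheel:

```python
# Hedged composition sketch for the weight_utils.py helpers above.
import glob
import os

from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf, filter_files_not_needed_for_inference,
    pt_weights_iterator, safetensors_weights_iterator)

model = "facebook/opt-125m"  # placeholder HF repo id
hf_folder = download_weights_from_hf(model,
                                     cache_dir=None,
                                     allow_patterns=["*.safetensors", "*.bin"])

# Prefer safetensors when present, otherwise fall back to .bin checkpoints
# with trainer-only files filtered out.
st_files = glob.glob(os.path.join(hf_folder, "*.safetensors"))
if st_files:
    iterator = safetensors_weights_iterator(st_files)
else:
    bin_files = filter_files_not_needed_for_inference(
        glob.glob(os.path.join(hf_folder, "*.bin")))
    iterator = pt_weights_iterator(bin_files)

for name, tensor in iterator:
    print(name, tuple(tensor.shape))
```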
vllm/model_executor/models/__init__.py (new file, 119 lines):

```python
import importlib
from typing import Dict, List, Optional, Type

import torch.nn as nn

from vllm.logger import init_logger
from vllm.utils import is_hip

logger = init_logger(__name__)

# Architecture -> (module, class).
_MODELS = {
    "AquilaModel": ("llama", "LlamaForCausalLM"),
    "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
    "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),  # baichuan-7b
    "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),  # baichuan-13b
    "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
    "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
    "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
    "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
    "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
    "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    "LlavaForConditionalGeneration":
    ("llava", "LlavaForConditionalGeneration"),
    # For decapoda-research/llama-*
    "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
    "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
    "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
    "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
    # transformers's mpt class has lower case
    "MptForCausalLM": ("mpt", "MPTForCausalLM"),
    "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
    "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
    "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
    "OPTForCausalLM": ("opt", "OPTForCausalLM"),
    "OrionForCausalLM": ("orion", "OrionForCausalLM"),
    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
    "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
    "RWForCausalLM": ("falcon", "FalconForCausalLM"),
    "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
    "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
}

# Architecture -> type.
# out of tree models
_OOT_MODELS: Dict[str, Type[nn.Module]] = {}

# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS = []

# Models partially supported by ROCm.
# Architecture -> Reason.
_ROCM_PARTIALLY_SUPPORTED_MODELS = {
    "Qwen2ForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MistralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MixtralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
}


class ModelRegistry:

    @staticmethod
    def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
        if model_arch in _OOT_MODELS:
            return _OOT_MODELS[model_arch]
        if model_arch not in _MODELS:
            return None
        if is_hip():
            if model_arch in _ROCM_UNSUPPORTED_MODELS:
                raise ValueError(
                    f"Model architecture {model_arch} is not supported by "
                    "ROCm for now.")
            if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
                logger.warning(
                    "Model architecture %s is partially supported by ROCm: %s",
                    model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])

        module_name, model_cls_name = _MODELS[model_arch]
        module = importlib.import_module(
            f"vllm.model_executor.models.{module_name}")
        return getattr(module, model_cls_name, None)

    @staticmethod
    def get_supported_archs() -> List[str]:
        return list(_MODELS.keys())

    @staticmethod
    def register_model(model_arch: str, model_cls: Type[nn.Module]):
        if model_arch in _MODELS:
            logger.warning(
                "Model architecture %s is already registered, and will be "
                "overwritten by the new model class %s.", model_arch,
                model_cls.__name__)
        global _OOT_MODELS
        _OOT_MODELS[model_arch] = model_cls


__all__ = [
    "ModelRegistry",
]
```