vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/model_executor/model_loader/weight_utils.py (new file)
@@ -0,0 +1,372 @@
"""Utilities for downloading and initializing model weights."""
import fnmatch
import glob
import hashlib
import json
import os
import tempfile
from collections import defaultdict
from typing import Any, Generator, Iterable, List, Optional, Tuple

import filelock
import huggingface_hub.constants
import numpy as np
import torch
from huggingface_hub import HfFileSystem, snapshot_download
from safetensors.torch import load_file, safe_open, save_file
from tqdm.auto import tqdm

from vllm.config import LoadConfig, ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (QuantizationConfig,
                                                     get_quantization_config)
from vllm.model_executor.layers.quantization.schema import QuantParamSchema

logger = init_logger(__name__)

# Use the system-level temp directory for file locks, so that multiple users
# can share the same lock without error.
# Lock files in the temp directory will be automatically deleted when the
# system reboots, so users will not complain about annoying lock files.
temp_dir = tempfile.gettempdir()


def enable_hf_transfer():
    """Automatically activate hf_transfer."""
    if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
        try:
            # enable hf hub transfer if available
            import hf_transfer  # type: ignore # noqa
            huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
        except ImportError:
            pass


enable_hf_transfer()


class DisabledTqdm(tqdm):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, disable=True)


def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
    lock_dir = cache_dir or temp_dir
    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
    model_name = model_name_or_path.replace("/", "-")
    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
    # add hash to avoid conflict with old users' lock files
    lock_file_name = hash_name + model_name + ".lock"
    # mode 0o666 is required for the filelock to be shared across users
    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
                             mode=0o666)
    return lock


def _shared_pointers(tensors):
    ptrs = defaultdict(list)
    for k, v in tensors.items():
        ptrs[v.data_ptr()].append(k)
    failing = []
    for _, names in ptrs.items():
        if len(names) > 1:
            failing.append(names)
    return failing


def convert_bin_to_safetensor_file(
    pt_filename: str,
    sf_filename: str,
) -> None:
    loaded = torch.load(pt_filename, map_location="cpu")
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]
    shared = _shared_pointers(loaded)
    for shared_weights in shared:
        for name in shared_weights[1:]:
            loaded.pop(name)

    # Force tensors to be contiguous.
    loaded = {k: v.contiguous() for k, v in loaded.items()}

    dirname = os.path.dirname(sf_filename)
    os.makedirs(dirname, exist_ok=True)
    save_file(loaded, sf_filename, metadata={"format": "pt"})

    # check file size
    sf_size = os.stat(sf_filename).st_size
    pt_size = os.stat(pt_filename).st_size
    if (sf_size - pt_size) / pt_size > 0.01:
        raise RuntimeError(f"""The file size difference is more than 1%:
         - {sf_filename}: {sf_size}
         - {pt_filename}: {pt_size}
         """)

    # check if the tensors are the same
    reloaded = load_file(sf_filename)
    for k in loaded:
        pt_tensor = loaded[k]
        sf_tensor = reloaded[k]
        if not torch.equal(pt_tensor, sf_tensor):
            raise RuntimeError(f"The output tensors do not match for key {k}")


# TODO(woosuk): Move this to other place.
def get_quant_config(model_config: ModelConfig,
                     load_config: LoadConfig) -> QuantizationConfig:
    quant_cls = get_quantization_config(model_config.quantization)
    # Read the quantization config from the HF model config, if available.
    hf_quant_config = getattr(model_config.hf_config, "quantization_config",
                              None)
    if hf_quant_config is not None:
        return quant_cls.from_config(hf_quant_config)
    model_name_or_path = model_config.model
    is_local = os.path.isdir(model_name_or_path)
    if not is_local:
        # Download the config files.
        with get_lock(model_name_or_path, load_config.download_dir):
            hf_folder = snapshot_download(
                model_name_or_path,
                revision=model_config.revision,
                allow_patterns="*.json",
                cache_dir=load_config.download_dir,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                tqdm_class=DisabledTqdm,
            )
    else:
        hf_folder = model_name_or_path

    possible_config_filenames = quant_cls.get_config_filenames()

    # If the quantization config is not found, use the default config.
    if not possible_config_filenames:
        return quant_cls()

    config_files = glob.glob(os.path.join(hf_folder, "*.json"))

    quant_config_files = [
        f for f in config_files if any(
            f.endswith(x) for x in possible_config_filenames)
    ]
    if len(quant_config_files) == 0:
        raise ValueError(
            f"Cannot find the config file for {model_config.quantization}")
    if len(quant_config_files) > 1:
        raise ValueError(
            f"Found multiple config files for {model_config.quantization}: "
            f"{quant_config_files}")

    quant_config_file = quant_config_files[0]
    with open(quant_config_file, "r") as f:
        config = json.load(f)
    return quant_cls.from_config(config)


def download_weights_from_hf(
    model_name_or_path: str,
    cache_dir: Optional[str],
    allow_patterns: List[str],
    revision: Optional[str] = None,
) -> str:
    """Download model weights from Hugging Face Hub.

    Args:
        model_name_or_path (str): The model name or path.
        cache_dir (Optional[str]): The cache directory to store the model
            weights. If None, will use HF defaults.
        allow_patterns (List[str]): The allowed patterns for the
            weight files. Files matched by any of the patterns will be
            downloaded.
        revision (Optional[str]): The revision of the model.

    Returns:
        str: The path to the downloaded model weights.
    """
    if not huggingface_hub.constants.HF_HUB_OFFLINE:
        # Before downloading, look at what is available on the Hub:
        fs = HfFileSystem()
        file_list = fs.ls(model_name_or_path, detail=False, revision=revision)

        # depending on what is available, download different things
        for pattern in allow_patterns:
            matching = fnmatch.filter(file_list, pattern)
            if len(matching) > 0:
                allow_patterns = [pattern]
                break

    logger.info("Using model weights format %s", allow_patterns)
    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(model_name_or_path, cache_dir):
        hf_folder = snapshot_download(
            model_name_or_path,
            allow_patterns=allow_patterns,
            cache_dir=cache_dir,
            tqdm_class=DisabledTqdm,
            revision=revision,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
        )
    return hf_folder


def filter_files_not_needed_for_inference(
        hf_weights_files: List[str]) -> List[str]:
    """
    Exclude files that are not needed for inference.

    See https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
    """
    blacklist = [
        "training_args.bin",
        "optimizer.bin",
        "optimizer.pt",
        "scheduler.pt",
        "scaler.pt",
    ]
    hf_weights_files = [
        f for f in hf_weights_files
        if not any(f.endswith(x) for x in blacklist)
    ]
    return hf_weights_files


def np_cache_weights_iterator(
    model_name_or_path: str, cache_dir: Optional[str], hf_folder: str,
    hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model np files.

    Will dump the model weights to numpy files if they are not already dumped.
    """
    # Convert the model weights from torch tensors to numpy arrays for
    # faster loading.
    np_folder = os.path.join(hf_folder, "np")
    os.makedirs(np_folder, exist_ok=True)
    weight_names_file = os.path.join(np_folder, "weight_names.json")
    # Use file lock to prevent multiple processes from
    # dumping the same model weights to numpy at the same time.
    with get_lock(model_name_or_path, cache_dir):
        if not os.path.exists(weight_names_file):
            weight_names = []
            for bin_file in hf_weights_files:
                state = torch.load(bin_file, map_location="cpu")
                for name, param in state.items():
                    param_path = os.path.join(np_folder, name)
                    with open(param_path, "wb") as f:
                        np.save(f, param.cpu().detach().numpy())
                    weight_names.append(name)
            with open(weight_names_file, "w") as f:
                json.dump(weight_names, f)

    with open(weight_names_file, "r") as f:
        weight_names = json.load(f)

    for name in weight_names:
        param_path = os.path.join(np_folder, name)
        with open(param_path, "rb") as f:
            param = np.load(f)
        yield name, torch.from_numpy(param)


def safetensors_weights_iterator(
    hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model safetensor files."""
    for st_file in hf_weights_files:
        with safe_open(st_file, framework="pt") as f:
            for name in f.keys():  # noqa: SIM118
                param = f.get_tensor(name)
                yield name, param


def pt_weights_iterator(
    hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model bin/pt files."""
    for bin_file in hf_weights_files:
        state = torch.load(bin_file, map_location="cpu")
        for name, param in state.items():
            yield name, param
        del state
        torch.cuda.empty_cache()


def kv_cache_scales_loader(
        filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int,
        model_type: Optional[str]) -> Iterable[Tuple[int, float]]:
    """
    A simple utility to read in KV cache scaling factors that have been
    previously serialized to disk. Used by the model to populate the appropriate
    KV cache scaling factors. The serialization should represent a dictionary
    whose keys are the TP ranks and values are another dictionary mapping layers
    to their KV cache scaling factors.
    Keep this function in sync with the output of examples/fp8/extract_scales.py
    """
    try:
        with open(filename) as f:
            context = {
                "model_type": model_type,
                "num_hidden_layers": num_hidden_layers,
                "tp_rank": tp_rank,
                "tp_size": tp_size,
            }
            schema_dct = json.load(f)
            schema = QuantParamSchema.model_validate(schema_dct,
                                                     context=context)
            layer_scales_map = schema.kv_cache.scaling_factor[tp_rank]
            return layer_scales_map.items()

    except FileNotFoundError:
        logger.error("File or directory '%s' not found.", filename)
    except json.JSONDecodeError:
        logger.error("Error decoding JSON in file '%s'.", filename)
    except Exception as e:
        logger.error("An error occurred while reading '%s': %s", filename, e)
    # This section is reached if and only if any of the excepts are hit
    # Return an empty iterable (list) => no KV cache scales are loaded,
    # which ultimately defaults to 1.0 scales
    logger.warning(
        "Defaulting to KV cache scaling factors = 1.0 for all "
        "layers in TP rank %d as an error occurred during loading.", tp_rank)
    return []


def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
    """Convert a PySafeSlice object from safetensors to torch.Tensor.

    PySafeSlice objects support indexing, which is done before loading the
    actual tensor and can reduce the amount of memory being read into
    memory. However, they do not support more advanced functionality
    like `.view()` or `.t()`. Therefore, if we need to modify the loaded
    tensor with these more complicated operators, we need to convert it to
    a tensor first.
    """
    if not isinstance(x, torch.Tensor):
        x = x[:]
    return x


def default_weight_loader(param: torch.Tensor,
                          loaded_weight: torch.Tensor) -> None:
    """Default weight loader."""
    assert param.size() == loaded_weight.size()
    param.data.copy_(loaded_weight)


def initialize_dummy_weights(
    model: torch.nn.Module,
    low: float = -1e-3,
    high: float = 1e-3,
) -> None:
    """Initialize model weights with random values.

    The model weights must be randomly initialized for accurate performance
    measurements. Additionally, the model weights should not cause NaNs in the
    forward pass. We empirically found that initializing the weights with
    values between -1e-3 and 1e-3 works well for most models.
    """
    for param in model.state_dict().values():
        if torch.is_floating_point(param):
            param.data.uniform_(low, high)
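
For orientation only (not part of the wheel), a minimal sketch of how the helpers above are typically chained when loading a checkpoint: download the weight files, drop trainer-only artifacts, then stream tensors with the matching iterator. The model name and patterns are placeholders.

# Illustrative usage sketch of the weight_utils helpers above.
import glob
import os

from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf, filter_files_not_needed_for_inference,
    safetensors_weights_iterator)

model = "facebook/opt-125m"  # placeholder model name
# Prefer safetensors, fall back to .bin if none are published.
hf_folder = download_weights_from_hf(model, cache_dir=None,
                                     allow_patterns=["*.safetensors", "*.bin"])
files = glob.glob(os.path.join(hf_folder, "*.safetensors"))
files = filter_files_not_needed_for_inference(files)
for name, tensor in safetensors_weights_iterator(files):
    print(name, tuple(tensor.shape))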
vllm/model_executor/models/__init__.py (new file)
@@ -0,0 +1,119 @@
import importlib
from typing import Dict, List, Optional, Type

import torch.nn as nn

from vllm.logger import init_logger
from vllm.utils import is_hip

logger = init_logger(__name__)

# Architecture -> (module, class).
_MODELS = {
    "AquilaModel": ("llama", "LlamaForCausalLM"),
    "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
    "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),  # baichuan-7b
    "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),  # baichuan-13b
    "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
    "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
    "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
    "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
    "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
    "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    "LlavaForConditionalGeneration":
    ("llava", "LlavaForConditionalGeneration"),
    # For decapoda-research/llama-*
    "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
    "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
    "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
    "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
    # transformers's mpt class has lower case
    "MptForCausalLM": ("mpt", "MPTForCausalLM"),
    "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
    "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
    "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
    "OPTForCausalLM": ("opt", "OPTForCausalLM"),
    "OrionForCausalLM": ("orion", "OrionForCausalLM"),
    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
    "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
    "RWForCausalLM": ("falcon", "FalconForCausalLM"),
    "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
    "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
}

# Architecture -> type.
# Out-of-tree models.
_OOT_MODELS: Dict[str, Type[nn.Module]] = {}

# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS = []

# Models partially supported by ROCm.
# Architecture -> Reason.
_ROCM_PARTIALLY_SUPPORTED_MODELS = {
    "Qwen2ForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MistralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MixtralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
}


class ModelRegistry:

    @staticmethod
    def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
        if model_arch in _OOT_MODELS:
            return _OOT_MODELS[model_arch]
        if model_arch not in _MODELS:
            return None
        if is_hip():
            if model_arch in _ROCM_UNSUPPORTED_MODELS:
                raise ValueError(
                    f"Model architecture {model_arch} is not supported by "
                    "ROCm for now.")
            if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
                logger.warning(
                    "Model architecture %s is partially supported by ROCm: %s",
                    model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])

        module_name, model_cls_name = _MODELS[model_arch]
        module = importlib.import_module(
            f"vllm.model_executor.models.{module_name}")
        return getattr(module, model_cls_name, None)

    @staticmethod
    def get_supported_archs() -> List[str]:
        return list(_MODELS.keys())

    @staticmethod
    def register_model(model_arch: str, model_cls: Type[nn.Module]):
        if model_arch in _MODELS:
            logger.warning(
                "Model architecture %s is already registered, and will be "
                "overwritten by the new model class %s.", model_arch,
                model_cls.__name__)
        global _OOT_MODELS
        _OOT_MODELS[model_arch] = model_cls


__all__ = [
    "ModelRegistry",
]
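
As a usage sketch (the class name below is hypothetical, not shipped in the wheel), this is how an out-of-tree architecture would be registered and resolved through the ModelRegistry defined above:

# Illustrative only: registering and looking up an out-of-tree model class.
import torch.nn as nn

from vllm.model_executor.models import ModelRegistry


class MyNPUModelForCausalLM(nn.Module):  # hypothetical placeholder model
    pass


ModelRegistry.register_model("MyNPUModelForCausalLM", MyNPUModelForCausalLM)
assert ModelRegistry.load_model_cls("MyNPUModelForCausalLM") is MyNPUModelForCausalLM
print(len(ModelRegistry.get_supported_archs()), "built-in architectures")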