vllm-npu 0.4.2__py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/engine/arg_utils.py
@@ -0,0 +1,649 @@
+ import argparse
+ import dataclasses
+ from dataclasses import dataclass
+ from typing import List, Optional, Union
+
+ from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
+                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
+                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
+                          TokenizerPoolConfig, VisionLanguageConfig)
+ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+ from vllm.utils import str_to_int_tuple
+
+
+ def nullable_str(val: str):
+     if not val or val == "None":
+         return None
+     return val
+
+
+ @dataclass
+ class EngineArgs:
+     """Arguments for vLLM engine."""
+     model: str
+     served_model_name: Optional[Union[List[str]]] = None
+     tokenizer: Optional[str] = None
+     skip_tokenizer_init: bool = False
+     tokenizer_mode: str = 'auto'
+     trust_remote_code: bool = False
+     download_dir: Optional[str] = None
+     load_format: str = 'auto'
+     dtype: str = 'auto'
+     kv_cache_dtype: str = 'auto'
+     quantization_param_path: Optional[str] = None
+     seed: int = 0
+     max_model_len: Optional[int] = None
+     worker_use_ray: bool = False
+     pipeline_parallel_size: int = 1
+     tensor_parallel_size: int = 1
+     max_parallel_loading_workers: Optional[int] = None
+     block_size: int = 16
+     enable_prefix_caching: bool = False
+     use_v2_block_manager: bool = False
+     swap_space: int = 4  # GiB
+     gpu_memory_utilization: float = 0.90
+     max_num_batched_tokens: Optional[int] = None
+     max_num_seqs: int = 256
+     max_logprobs: int = 5  # OpenAI default value
+     disable_log_stats: bool = False
+     revision: Optional[str] = None
+     code_revision: Optional[str] = None
+     tokenizer_revision: Optional[str] = None
+     quantization: Optional[str] = None
+     enforce_eager: bool = False
+     max_context_len_to_capture: Optional[int] = None
+     max_seq_len_to_capture: int = 8192
+     disable_custom_all_reduce: bool = False
+     tokenizer_pool_size: int = 0
+     tokenizer_pool_type: str = "ray"
+     tokenizer_pool_extra_config: Optional[dict] = None
+     enable_lora: bool = False
+     max_loras: int = 1
+     max_lora_rank: int = 16
+     fully_sharded_loras: bool = False
+     lora_extra_vocab_size: int = 256
+     lora_dtype = 'auto'
+     max_cpu_loras: Optional[int] = None
+     device: str = 'auto'
+     ray_workers_use_nsight: bool = False
+     num_gpu_blocks_override: Optional[int] = None
+     num_lookahead_slots: int = 0
+     model_loader_extra_config: Optional[dict] = None
+
+     # Related to Vision-language models such as llava
+     image_input_type: Optional[str] = None
+     image_token_id: Optional[int] = None
+     image_input_shape: Optional[str] = None
+     image_feature_size: Optional[int] = None
+     scheduler_delay_factor: float = 0.0
+     enable_chunked_prefill: bool = False
+
+     guided_decoding_backend: str = 'outlines'
+     # Speculative decoding configuration.
+     speculative_model: Optional[str] = None
+     num_speculative_tokens: Optional[int] = None
+     speculative_max_model_len: Optional[int] = None
+     ngram_prompt_lookup_max: Optional[int] = None
+     ngram_prompt_lookup_min: Optional[int] = None
+
+     def __post_init__(self):
+         if self.tokenizer is None:
+             self.tokenizer = self.model
+
+     @staticmethod
+     def add_cli_args(
+             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+         """Shared CLI arguments for vLLM engine."""
+
+         # Model arguments
+         parser.add_argument(
+             '--model',
+             type=str,
+             default='facebook/opt-125m',
+             help='Name or path of the huggingface model to use.')
+         parser.add_argument(
+             '--tokenizer',
+             type=nullable_str,
+             default=EngineArgs.tokenizer,
+             help='Name or path of the huggingface tokenizer to use.')
+         parser.add_argument(
+             '--skip-tokenizer-init',
+             action='store_true',
+             help='Skip initialization of tokenizer and detokenizer.')
+         parser.add_argument(
+             '--revision',
+             type=nullable_str,
+             default=None,
+             help='The specific model version to use. It can be a branch '
+             'name, a tag name, or a commit id. If unspecified, will use '
+             'the default version.')
+         parser.add_argument(
+             '--code-revision',
+             type=nullable_str,
+             default=None,
+             help='The specific revision to use for the model code on '
+             'Hugging Face Hub. It can be a branch name, a tag name, or a '
+             'commit id. If unspecified, will use the default version.')
+         parser.add_argument(
+             '--tokenizer-revision',
+             type=nullable_str,
+             default=None,
+             help='The specific tokenizer version to use. It can be a branch '
+             'name, a tag name, or a commit id. If unspecified, will use '
+             'the default version.')
+         parser.add_argument(
+             '--tokenizer-mode',
+             type=str,
+             default=EngineArgs.tokenizer_mode,
+             choices=['auto', 'slow'],
+             help='The tokenizer mode.\n\n* "auto" will use the '
+             'fast tokenizer if available.\n* "slow" will '
+             'always use the slow tokenizer.')
+         parser.add_argument('--trust-remote-code',
+                             action='store_true',
+                             help='Trust remote code from huggingface.')
+         parser.add_argument('--download-dir',
+                             type=nullable_str,
+                             default=EngineArgs.download_dir,
+                             help='Directory to download and load the weights, '
+                             'default to the default cache dir of '
+                             'huggingface.')
+         parser.add_argument(
+             '--load-format',
+             type=str,
+             default=EngineArgs.load_format,
+             choices=[
+                 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer'
+             ],
+             help='The format of the model weights to load.\n\n'
+             '* "auto" will try to load the weights in the safetensors format '
+             'and fall back to the pytorch bin format if safetensors format '
+             'is not available.\n'
+             '* "pt" will load the weights in the pytorch bin format.\n'
+             '* "safetensors" will load the weights in the safetensors format.\n'
+             '* "npcache" will load the weights in pytorch format and store '
+             'a numpy cache to speed up the loading.\n'
+             '* "dummy" will initialize the weights with random values, '
+             'which is mainly for profiling.\n'
+             '* "tensorizer" will load the weights using tensorizer from '
+             'CoreWeave which assumes tensorizer_uri is set to the location of '
+             'the serialized weights.')
+         parser.add_argument(
+             '--dtype',
+             type=str,
+             default=EngineArgs.dtype,
+             choices=[
+                 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
+             ],
+             help='Data type for model weights and activations.\n\n'
+             '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+             'BF16 precision for BF16 models.\n'
+             '* "half" for FP16. Recommended for AWQ quantization.\n'
+             '* "float16" is the same as "half".\n'
+             '* "bfloat16" for a balance between precision and range.\n'
+             '* "float" is shorthand for FP32 precision.\n'
+             '* "float32" for FP32 precision.')
+         parser.add_argument(
+             '--kv-cache-dtype',
+             type=str,
+             choices=['auto', 'fp8'],
+             default=EngineArgs.kv_cache_dtype,
+             help='Data type for kv cache storage. If "auto", will use model '
+             'data type. FP8_E5M2 (without scaling) is only supported on cuda '
+             'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
+             'supported for common inference criteria.')
+         parser.add_argument(
+             '--quantization-param-path',
+             type=nullable_str,
+             default=None,
+             help='Path to the JSON file containing the KV cache '
+             'scaling factors. This should generally be supplied, when '
+             'KV cache dtype is FP8. Otherwise, KV cache scaling factors '
+             'default to 1.0, which may cause accuracy issues. '
+             'FP8_E5M2 (without scaling) is only supported on cuda version '
+             'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
+             'supported for common inference criteria.')
+         parser.add_argument('--max-model-len',
+                             type=int,
+                             default=EngineArgs.max_model_len,
+                             help='Model context length. If unspecified, will '
+                             'be automatically derived from the model config.')
+         parser.add_argument(
+             '--guided-decoding-backend',
+             type=str,
+             default='outlines',
+             choices=['outlines', 'lm-format-enforcer'],
+             help='Which engine will be used for guided decoding'
+             ' (JSON schema / regex etc) by default. Currently support '
+             'https://github.com/outlines-dev/outlines and '
+             'https://github.com/noamgat/lm-format-enforcer.'
+             ' Can be overridden per request via guided_decoding_backend'
+             ' parameter.')
+         # Parallel arguments
+         parser.add_argument('--worker-use-ray',
+                             action='store_true',
+                             help='Use Ray for distributed serving, will be '
+                             'automatically set when using more than 1 GPU.')
+         parser.add_argument('--pipeline-parallel-size',
+                             '-pp',
+                             type=int,
+                             default=EngineArgs.pipeline_parallel_size,
+                             help='Number of pipeline stages.')
+         parser.add_argument('--tensor-parallel-size',
+                             '-tp',
+                             type=int,
+                             default=EngineArgs.tensor_parallel_size,
+                             help='Number of tensor parallel replicas.')
+         parser.add_argument(
+             '--max-parallel-loading-workers',
+             type=int,
+             default=EngineArgs.max_parallel_loading_workers,
+             help='Load model sequentially in multiple batches, '
+             'to avoid RAM OOM when using tensor '
+             'parallel and large models.')
+         parser.add_argument(
+             '--ray-workers-use-nsight',
+             action='store_true',
+             help='If specified, use nsight to profile Ray workers.')
+         # KV cache arguments
+         parser.add_argument('--block-size',
+                             type=int,
+                             default=EngineArgs.block_size,
+                             choices=[8, 16, 32],
+                             help='Token block size for contiguous chunks of '
+                             'tokens.')
+
+         parser.add_argument('--enable-prefix-caching',
+                             action='store_true',
+                             help='Enables automatic prefix caching.')
+         parser.add_argument('--use-v2-block-manager',
+                             action='store_true',
+                             help='Use BlockSpaceManagerV2.')
+         parser.add_argument(
+             '--num-lookahead-slots',
+             type=int,
+             default=EngineArgs.num_lookahead_slots,
+             help='Experimental scheduling config necessary for '
+             'speculative decoding. This will be replaced by '
+             'speculative config in the future; it is present '
+             'to enable correctness tests until then.')
+
+         parser.add_argument('--seed',
+                             type=int,
+                             default=EngineArgs.seed,
+                             help='Random seed for operations.')
+         parser.add_argument('--swap-space',
+                             type=int,
+                             default=EngineArgs.swap_space,
+                             help='CPU swap space size (GiB) per GPU.')
+         parser.add_argument(
+             '--gpu-memory-utilization',
+             type=float,
+             default=EngineArgs.gpu_memory_utilization,
+             help='The fraction of GPU memory to be used for the model '
+             'executor, which can range from 0 to 1. For example, a value of '
+             '0.5 would imply 50%% GPU memory utilization. If unspecified, '
+             'will use the default value of 0.9.')
+         parser.add_argument(
+             '--num-gpu-blocks-override',
+             type=int,
+             default=None,
+             help='If specified, ignore GPU profiling result and use this '
+             'number of GPU blocks. Used for testing preemption.')
+         parser.add_argument('--max-num-batched-tokens',
+                             type=int,
+                             default=EngineArgs.max_num_batched_tokens,
+                             help='Maximum number of batched tokens per '
+                             'iteration.')
+         parser.add_argument('--max-num-seqs',
+                             type=int,
+                             default=EngineArgs.max_num_seqs,
+                             help='Maximum number of sequences per iteration.')
+         parser.add_argument(
+             '--max-logprobs',
+             type=int,
+             default=EngineArgs.max_logprobs,
+             help=('Max number of log probs to return when logprobs is '
+                   'specified in SamplingParams.'))
+         parser.add_argument('--disable-log-stats',
+                             action='store_true',
+                             help='Disable logging statistics.')
+         # Quantization settings.
+         parser.add_argument('--quantization',
+                             '-q',
+                             type=nullable_str,
+                             choices=[*QUANTIZATION_METHODS, None],
+                             default=EngineArgs.quantization,
+                             help='Method used to quantize the weights. If '
+                             'None, we first check the `quantization_config` '
+                             'attribute in the model config file. If that is '
+                             'None, we assume the model weights are not '
+                             'quantized and use `dtype` to determine the data '
+                             'type of the weights.')
+         parser.add_argument('--enforce-eager',
+                             action='store_true',
+                             help='Always use eager-mode PyTorch. If False, '
+                             'will use eager mode and CUDA graph in hybrid '
+                             'for maximal performance and flexibility.')
+         parser.add_argument('--max-context-len-to-capture',
+                             type=int,
+                             default=EngineArgs.max_context_len_to_capture,
+                             help='Maximum context length covered by CUDA '
+                             'graphs. When a sequence has context length '
+                             'larger than this, we fall back to eager mode. '
+                             '(DEPRECATED. Use --max-seq_len-to-capture instead'
+                             ')')
+         parser.add_argument('--max-seq_len-to-capture',
+                             type=int,
+                             default=EngineArgs.max_seq_len_to_capture,
+                             help='Maximum sequence length covered by CUDA '
+                             'graphs. When a sequence has context length '
+                             'larger than this, we fall back to eager mode.')
+         parser.add_argument('--disable-custom-all-reduce',
+                             action='store_true',
+                             default=EngineArgs.disable_custom_all_reduce,
+                             help='See ParallelConfig.')
+         parser.add_argument('--tokenizer-pool-size',
+                             type=int,
+                             default=EngineArgs.tokenizer_pool_size,
+                             help='Size of tokenizer pool to use for '
+                             'asynchronous tokenization. If 0, will '
+                             'use synchronous tokenization.')
+         parser.add_argument('--tokenizer-pool-type',
+                             type=str,
+                             default=EngineArgs.tokenizer_pool_type,
+                             help='Type of tokenizer pool to use for '
+                             'asynchronous tokenization. Ignored '
+                             'if tokenizer_pool_size is 0.')
+         parser.add_argument('--tokenizer-pool-extra-config',
+                             type=nullable_str,
+                             default=EngineArgs.tokenizer_pool_extra_config,
+                             help='Extra config for tokenizer pool. '
+                             'This should be a JSON string that will be '
+                             'parsed into a dictionary. Ignored if '
+                             'tokenizer_pool_size is 0.')
+         # LoRA related configs
+         parser.add_argument('--enable-lora',
+                             action='store_true',
+                             help='If True, enable handling of LoRA adapters.')
+         parser.add_argument('--max-loras',
+                             type=int,
+                             default=EngineArgs.max_loras,
+                             help='Max number of LoRAs in a single batch.')
+         parser.add_argument('--max-lora-rank',
+                             type=int,
+                             default=EngineArgs.max_lora_rank,
+                             help='Max LoRA rank.')
+         parser.add_argument(
+             '--lora-extra-vocab-size',
+             type=int,
+             default=EngineArgs.lora_extra_vocab_size,
+             help=('Maximum size of extra vocabulary that can be '
+                   'present in a LoRA adapter (added to the base '
+                   'model vocabulary).'))
+         parser.add_argument(
+             '--lora-dtype',
+             type=str,
+             default=EngineArgs.lora_dtype,
+             choices=['auto', 'float16', 'bfloat16', 'float32'],
+             help=('Data type for LoRA. If auto, will default to '
+                   'base model dtype.'))
+         parser.add_argument(
+             '--max-cpu-loras',
+             type=int,
+             default=EngineArgs.max_cpu_loras,
+             help=('Maximum number of LoRAs to store in CPU memory. '
+                   'Must be >= max_num_seqs. '
+                   'Defaults to max_num_seqs.'))
+         parser.add_argument(
+             '--fully-sharded-loras',
+             action='store_true',
+             help=('By default, only half of the LoRA computation is '
+                   'sharded with tensor parallelism. '
+                   'Enabling this will use the fully sharded layers. '
+                   'At high sequence length, max rank or '
+                   'tensor parallel size, this is likely faster.'))
+         parser.add_argument("--device",
+                             type=str,
+                             default=EngineArgs.device,
+                             choices=["auto", "cuda", "neuron", "cpu"],
+                             help='Device type for vLLM execution.')
+         # Related to Vision-language models such as llava
+         parser.add_argument(
+             '--image-input-type',
+             type=nullable_str,
+             default=None,
+             choices=[
+                 t.name.lower() for t in VisionLanguageConfig.ImageInputType
+             ],
+             help=('The image input type passed into vLLM. '
+                   'Should be one of "pixel_values" or "image_features".'))
+         parser.add_argument('--image-token-id',
+                             type=int,
+                             default=None,
+                             help=('Input id for image token.'))
+         parser.add_argument(
+             '--image-input-shape',
+             type=nullable_str,
+             default=None,
+             help=('The biggest image input shape (worst for memory footprint) '
+                   'given an input type. Only used for vLLM\'s profile_run.'))
+         parser.add_argument(
+             '--image-feature-size',
+             type=int,
+             default=None,
+             help=('The image feature size along the context dimension.'))
+         parser.add_argument(
+             '--scheduler-delay-factor',
+             type=float,
+             default=EngineArgs.scheduler_delay_factor,
+             help='Apply a delay (of delay factor multiplied by previous '
+             'prompt latency) before scheduling next prompt.')
+         parser.add_argument(
+             '--enable-chunked-prefill',
+             action='store_true',
+             help='If set, the prefill requests can be chunked based on the '
+             'max_num_batched_tokens.')
+
+         parser.add_argument(
+             '--speculative-model',
+             type=nullable_str,
+             default=EngineArgs.speculative_model,
+             help=
+             'The name of the draft model to be used in speculative decoding.')
+
+         parser.add_argument(
+             '--num-speculative-tokens',
+             type=int,
+             default=EngineArgs.num_speculative_tokens,
+             help='The number of speculative tokens to sample from '
+             'the draft model in speculative decoding.')
+
+         parser.add_argument(
+             '--speculative-max-model-len',
+             type=int,
+             default=EngineArgs.speculative_max_model_len,
+             help='The maximum sequence length supported by the '
+             'draft model. Sequences over this length will skip '
+             'speculation.')
+
+         parser.add_argument(
+             '--ngram-prompt-lookup-max',
+             type=int,
+             default=EngineArgs.ngram_prompt_lookup_max,
+             help='Max size of window for ngram prompt lookup in speculative '
+             'decoding.')
+
+         parser.add_argument(
+             '--ngram-prompt-lookup-min',
+             type=int,
+             default=EngineArgs.ngram_prompt_lookup_min,
+             help='Min size of window for ngram prompt lookup in speculative '
+             'decoding.')
+
+         parser.add_argument('--model-loader-extra-config',
+                             type=nullable_str,
+                             default=EngineArgs.model_loader_extra_config,
+                             help='Extra config for model loader. '
+                             'This will be passed to the model loader '
+                             'corresponding to the chosen load_format. '
+                             'This should be a JSON string that will be '
+                             'parsed into a dictionary.')
+
+         parser.add_argument(
+             "--served-model-name",
+             nargs="+",
+             type=str,
+             default=None,
+             help="The model name(s) used in the API. If multiple "
+             "names are provided, the server will respond to any "
+             "of the provided names. The model name in the model "
+             "field of a response will be the first name in this "
+             "list. If not specified, the model name will be the "
+             "same as the `--model` argument. Note that this name(s) "
+             "will also be used in the `model_name` tag content of "
+             "prometheus metrics; if multiple names are provided, the "
+             "metrics tag will take the first one.")
+
+         return parser
+
+     @classmethod
+     def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
+         # Get the list of attributes of this dataclass.
+         attrs = [attr.name for attr in dataclasses.fields(cls)]
+         # Set the attributes from the parsed arguments.
+         engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+         return engine_args
+
+     def create_engine_config(self, ) -> EngineConfig:
+         device_config = DeviceConfig(self.device)
+         model_config = ModelConfig(
+             self.model, self.tokenizer, self.tokenizer_mode,
+             self.trust_remote_code, self.dtype, self.seed, self.revision,
+             self.code_revision, self.tokenizer_revision, self.max_model_len,
+             self.quantization, self.quantization_param_path,
+             self.enforce_eager, self.max_context_len_to_capture,
+             self.max_seq_len_to_capture, self.max_logprobs,
+             self.skip_tokenizer_init, self.served_model_name)
+         cache_config = CacheConfig(self.block_size,
+                                    self.gpu_memory_utilization,
+                                    self.swap_space, self.kv_cache_dtype,
+                                    self.num_gpu_blocks_override,
+                                    model_config.get_sliding_window(),
+                                    self.enable_prefix_caching)
+         parallel_config = ParallelConfig(
+             self.pipeline_parallel_size, self.tensor_parallel_size,
+             self.worker_use_ray, self.max_parallel_loading_workers,
+             self.disable_custom_all_reduce,
+             TokenizerPoolConfig.create_config(
+                 self.tokenizer_pool_size,
+                 self.tokenizer_pool_type,
+                 self.tokenizer_pool_extra_config,
+             ), self.ray_workers_use_nsight)
+
+         speculative_config = SpeculativeConfig.maybe_create_spec_config(
+             target_model_config=model_config,
+             target_parallel_config=parallel_config,
+             target_dtype=self.dtype,
+             speculative_model=self.speculative_model,
+             num_speculative_tokens=self.num_speculative_tokens,
+             speculative_max_model_len=self.speculative_max_model_len,
+             enable_chunked_prefill=self.enable_chunked_prefill,
+             use_v2_block_manager=self.use_v2_block_manager,
+             ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
+             ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
+         )
+
+         scheduler_config = SchedulerConfig(
+             self.max_num_batched_tokens,
+             self.max_num_seqs,
+             model_config.max_model_len,
+             self.use_v2_block_manager,
+             num_lookahead_slots=(self.num_lookahead_slots
+                                  if speculative_config is None else
+                                  speculative_config.num_lookahead_slots),
+             delay_factor=self.scheduler_delay_factor,
+             enable_chunked_prefill=self.enable_chunked_prefill,
+         )
+         lora_config = LoRAConfig(
+             max_lora_rank=self.max_lora_rank,
+             max_loras=self.max_loras,
+             fully_sharded_loras=self.fully_sharded_loras,
+             lora_extra_vocab_size=self.lora_extra_vocab_size,
+             lora_dtype=self.lora_dtype,
+             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
+             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
+
+         load_config = LoadConfig(
+             load_format=self.load_format,
+             download_dir=self.download_dir,
+             model_loader_extra_config=self.model_loader_extra_config,
+         )
+
+         if self.image_input_type:
+             if (not self.image_token_id or not self.image_input_shape
+                     or not self.image_feature_size):
+                 raise ValueError(
+                     'Specify `image_token_id`, `image_input_shape` and '
+                     '`image_feature_size` together with `image_input_type`.')
+             vision_language_config = VisionLanguageConfig(
+                 image_input_type=VisionLanguageConfig.
+                 get_image_input_enum_type(self.image_input_type),
+                 image_token_id=self.image_token_id,
+                 image_input_shape=str_to_int_tuple(self.image_input_shape),
+                 image_feature_size=self.image_feature_size,
+             )
+         else:
+             vision_language_config = None
+
+         decoding_config = DecodingConfig(
+             guided_decoding_backend=self.guided_decoding_backend)
+
+         return EngineConfig(model_config=model_config,
+                             cache_config=cache_config,
+                             parallel_config=parallel_config,
+                             scheduler_config=scheduler_config,
+                             device_config=device_config,
+                             lora_config=lora_config,
+                             vision_language_config=vision_language_config,
+                             speculative_config=speculative_config,
+                             load_config=load_config,
+                             decoding_config=decoding_config)
+
+
+ @dataclass
+ class AsyncEngineArgs(EngineArgs):
+     """Arguments for asynchronous vLLM engine."""
+     engine_use_ray: bool = False
+     disable_log_requests: bool = False
+     max_log_len: Optional[int] = None
+
+     @staticmethod
+     def add_cli_args(parser: argparse.ArgumentParser,
+                      async_args_only: bool = False) -> argparse.ArgumentParser:
+         if not async_args_only:
+             parser = EngineArgs.add_cli_args(parser)
+         parser.add_argument('--engine-use-ray',
+                             action='store_true',
+                             help='Use Ray to start the LLM engine in a '
+                             'separate process as the server process.')
+         parser.add_argument('--disable-log-requests',
+                             action='store_true',
+                             help='Disable logging requests.')
+         parser.add_argument('--max-log-len',
+                             type=int,
+                             default=None,
+                             help='Max number of prompt characters or prompt '
+                             'ID numbers being printed in log.'
+                             '\n\nDefault: Unlimited')
+         return parser
+
+
+ # These functions are used by sphinx to build the documentation
+ def _engine_args_parser():
+     return EngineArgs.add_cli_args(argparse.ArgumentParser())
+
+
+ def _async_engine_args_parser():
+     return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
+                                         async_args_only=True)
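
For orientation, here is a minimal usage sketch of how the helpers added above are typically wired together: add_cli_args registers the engine flags on an argparse parser, from_cli_args copies the parsed namespace into the EngineArgs dataclass, and create_engine_config expands it into the per-component config objects (ModelConfig, CacheConfig, ParallelConfig, and so on). This snippet is not part of the wheel; the example argument values are illustrative, and building the config resolves the model's Hugging Face config, so a reachable model name or local path is assumed.

import argparse

from vllm.engine.arg_utils import EngineArgs

# Build a parser that carries every engine flag defined in add_cli_args().
parser = argparse.ArgumentParser(description="EngineArgs demo")
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args(["--model", "facebook/opt-125m", "--dtype", "auto"])

# Copy the parsed namespace into the EngineArgs dataclass ...
engine_args = EngineArgs.from_cli_args(args)
# ... then expand it into the per-component configs consumed by the engine.
engine_config = engine_args.create_engine_config()
print(engine_config)

The async server entry points follow the same pattern but go through AsyncEngineArgs.add_cli_args, which layers the --engine-use-ray, --disable-log-requests, and --max-log-len flags on top of the shared set.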