vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/executor/gpu_executor.py
@@ -0,0 +1,150 @@
+ from typing import Any, Dict, List, Optional, Set, Tuple
+
+ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+ from vllm.logger import init_logger
+ from vllm.lora.request import LoRARequest
+ from vllm.sequence import ExecuteModelRequest, SamplerOutput
+ from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                         make_async)
+ from vllm.worker.worker_base import WorkerWrapperBase
+
+ logger = init_logger(__name__)
+
+
+ class GPUExecutor(ExecutorBase):
+
+     def _init_executor(self) -> None:
+         """Initialize the worker and load the model.
+
+         If speculative decoding is enabled, we instead create the speculative
+         worker.
+         """
+         if self.speculative_config is None:
+             self._init_non_spec_worker()
+         else:
+             self._init_spec_worker()
+
+     def _get_worker_kwargs(
+             self,
+             local_rank: int = 0,
+             rank: int = 0,
+             distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
+         """Return worker init args for a given rank."""
+         if distributed_init_method is None:
+             distributed_init_method = get_distributed_init_method(
+                 get_ip(), get_open_port())
+         return dict(
+             model_config=self.model_config,
+             parallel_config=self.parallel_config,
+             scheduler_config=self.scheduler_config,
+             device_config=self.device_config,
+             cache_config=self.cache_config,
+             load_config=self.load_config,
+             local_rank=local_rank,
+             rank=rank,
+             distributed_init_method=distributed_init_method,
+             lora_config=self.lora_config,
+             vision_language_config=self.vision_language_config,
+             is_driver_worker=rank == 0,
+         )
+
+     def _create_worker(self,
+                        local_rank: int = 0,
+                        rank: int = 0,
+                        distributed_init_method: Optional[str] = None):
+         wrapper = WorkerWrapperBase(
+             worker_module_name="vllm.worker.worker",
+             worker_class_name="Worker",
+         )
+         wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
+                                                       distributed_init_method))
+         return wrapper.worker
+
+     def _init_non_spec_worker(self):
+         assert self.parallel_config.world_size == 1, (
+             "GPUExecutor only supports single GPU.")
+
+         self.driver_worker = self._create_worker()
+         self.driver_worker.init_device()
+         self.driver_worker.load_model()
+
+     def _init_spec_worker(self):
+         """Initialize a SpecDecodeWorker, using a draft model for proposals.
+         """
+         assert self.speculative_config is not None
+
+         from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
+
+         target_worker = self._create_worker()
+
+         draft_worker_kwargs = self._get_worker_kwargs()
+         # Override draft-model specific worker args.
+         draft_worker_kwargs.update(
+             model_config=self.speculative_config.draft_model_config,
+             parallel_config=self.speculative_config.draft_parallel_config,
+             # TODO allow draft-model specific load config.
+             #load_config=self.load_config,
+         )
+
+         spec_decode_worker = SpecDecodeWorker.create_worker(
+             scorer_worker=target_worker,
+             draft_worker_kwargs=draft_worker_kwargs,
+         )
+
+         assert self.parallel_config.world_size == 1, (
+             "GPUExecutor only supports single GPU.")
+
+         self.driver_worker = spec_decode_worker
+
+         # Load model handled in spec decode worker.
+         self.driver_worker.init_device()
+
+     def determine_num_available_blocks(self) -> Tuple[int, int]:
+         """Determine the number of available KV blocks by invoking the
+         underlying worker.
+         """
+         return self.driver_worker.determine_num_available_blocks()
+
+     def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
+         """Initialize the KV cache by invoking the underlying worker.
+         """
+         # NOTE: This is logged in the executor because there can be >1 worker
+         # with other executors. We could log in the engine level, but work
+         # remains to abstract away the device for non-GPU configurations.
+         logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                     num_cpu_blocks)
+
+         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+     def execute_model(
+             self,
+             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+         output = self.driver_worker.execute_model(execute_model_req)
+         return output
+
+     def add_lora(self, lora_request: LoRARequest) -> bool:
+         assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+         return self.driver_worker.add_lora(lora_request)
+
+     def remove_lora(self, lora_id: int) -> bool:
+         assert lora_id > 0, "lora_id must be greater than 0."
+         return self.driver_worker.remove_lora(lora_id)
+
+     def list_loras(self) -> Set[int]:
+         return self.driver_worker.list_loras()
+
+     def check_health(self) -> None:
+         # GPUExecutor will always be healthy as long as
+         # it's running.
+         return
+
+
+ class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
+
+     async def execute_model_async(
+         self,
+         execute_model_req: ExecuteModelRequest,
+     ) -> List[SamplerOutput]:
+         output = await make_async(self.driver_worker.execute_model
+                                   )(execute_model_req=execute_model_req, )
+         return output
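
A minimal driver-side sketch of how the engine uses the GPUExecutor above, assuming `executor` is an already-constructed GPUExecutor and `execute_model_req` is an ExecuteModelRequest produced by the scheduler; both are created outside this file, so the names below are illustrative only:

    def profile_and_run(executor, execute_model_req):
        # 1. Ask the single driver worker how many KV-cache blocks fit.
        num_gpu_blocks, num_cpu_blocks = executor.determine_num_available_blocks()
        # 2. Allocate the KV cache; the executor logs the block counts.
        executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
        # 3. Each scheduler step is forwarded straight to the driver worker.
        return executor.execute_model(execute_model_req)
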
vllm/executor/multiproc_worker_utils.py
@@ -0,0 +1,263 @@
+ import asyncio
+ import multiprocessing
+ import os
+ import sys
+ import threading
+ import traceback
+ import uuid
+ from dataclasses import dataclass
+ from multiprocessing import Queue
+ from multiprocessing.connection import wait
+ from multiprocessing.process import BaseProcess
+ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
+                     TypeVar, Union)
+
+ import vllm.envs as envs
+ from vllm.logger import init_logger
+
+ logger = init_logger(__name__)
+
+ T = TypeVar('T')
+
+ _TERMINATE = "TERMINATE"  # sentinel
+
+ # ANSI color codes
+ CYAN = '\033[1;36m'
+ RESET = '\033[0;0m'
+
+ JOIN_TIMEOUT_S = 2
+
+ mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
+ mp = multiprocessing.get_context(mp_method)
+
+
+ @dataclass
+ class Result(Generic[T]):
+     """Result of task dispatched to worker"""
+
+     task_id: uuid.UUID
+     value: Optional[T] = None
+     exception: Optional[BaseException] = None
+
+
+ class ResultFuture(threading.Event, Generic[T]):
+     """Synchronous future for non-async case"""
+
+     def __init__(self):
+         super().__init__()
+         self.result: Optional[Result[T]] = None
+
+     def set_result(self, result: Result[T]):
+         self.result = result
+         self.set()
+
+     def get(self) -> T:
+         self.wait()
+         assert self.result is not None
+         if self.result.exception is not None:
+             raise self.result.exception
+         return self.result.value  # type: ignore[return-value]
+
+
+ def _set_future_result(future: Union[ResultFuture, asyncio.Future],
+                        result: Result):
+     if isinstance(future, ResultFuture):
+         future.set_result(result)
+         return
+     loop = future.get_loop()
+     if result.exception is not None:
+         loop.call_soon_threadsafe(future.set_exception, result.exception)
+     else:
+         loop.call_soon_threadsafe(future.set_result, result.value)
+
+
+ class ResultHandler(threading.Thread):
+     """Handle results from all workers (in background thread)"""
+
+     def __init__(self) -> None:
+         super().__init__(daemon=True)
+         self.result_queue = mp.Queue()
+         self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}
+
+     def run(self):
+         for result in iter(self.result_queue.get, _TERMINATE):
+             future = self.tasks.pop(result.task_id)
+             _set_future_result(future, result)
+         # Ensure that all waiters will receive an exception
+         for task_id, future in self.tasks.items():
+             _set_future_result(
+                 future,
+                 Result(task_id=task_id,
+                        exception=ChildProcessError("worker died")))
+
+     def close(self):
+         self.result_queue.put(_TERMINATE)
+
+
+ class WorkerMonitor(threading.Thread):
+     """Monitor worker status (in background thread)"""
+
+     def __init__(self, workers: List['ProcessWorkerWrapper'],
+                  result_handler: ResultHandler):
+         super().__init__(daemon=True)
+         self.workers = workers
+         self.result_handler = result_handler
+         self._close = False
+
+     def run(self) -> None:
+         # Blocks until any worker exits
+         dead_sentinels = wait([w.process.sentinel for w in self.workers])
+         if not self._close:
+             self._close = True
+
+             # Kill / cleanup all workers
+             for worker in self.workers:
+                 process = worker.process
+                 if process.sentinel in dead_sentinels:
+                     process.join(JOIN_TIMEOUT_S)
+                 if process.exitcode is not None and process.exitcode != 0:
+                     logger.error("Worker %s pid %s died, exit code: %s",
+                                  process.name, process.pid, process.exitcode)
+             # Cleanup any remaining workers
+             logger.info("Killing local vLLM worker processes")
+             for worker in self.workers:
+                 worker.kill_worker()
+             # Must be done after worker task queues are all closed
+             self.result_handler.close()
+
+         for worker in self.workers:
+             worker.process.join(JOIN_TIMEOUT_S)
+
+     def close(self):
+         if self._close:
+             return
+         self._close = True
+         logger.info("Terminating local vLLM worker processes")
+         for worker in self.workers:
+             worker.terminate_worker()
+         # Must be done after worker task queues are all closed
+         self.result_handler.close()
+
+
+ class ProcessWorkerWrapper:
+     """Local process wrapper for vllm.worker.Worker,
+     for handling single-node multi-GPU tensor parallel."""
+
+     def __init__(self, result_handler: ResultHandler,
+                  worker_factory: Callable[[], Any]) -> None:
+         self._task_queue = mp.Queue()
+         self.result_queue = result_handler.result_queue
+         self.tasks = result_handler.tasks
+         self.process: BaseProcess = mp.Process(  # type: ignore[attr-defined]
+             target=_run_worker_process,
+             name="VllmWorkerProcess",
+             kwargs=dict(
+                 worker_factory=worker_factory,
+                 task_queue=self._task_queue,
+                 result_queue=self.result_queue,
+             ),
+             daemon=True)
+
+         self.process.start()
+
+     def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
+                       method: str, args, kwargs):
+         task_id = uuid.uuid4()
+         self.tasks[task_id] = future
+         try:
+             self._task_queue.put((task_id, method, args, kwargs))
+         except BaseException as e:
+             del self.tasks[task_id]
+             raise ChildProcessError("worker died") from e
+
+     def execute_method(self, method: str, *args, **kwargs):
+         future: ResultFuture = ResultFuture()
+         self._enqueue_task(future, method, args, kwargs)
+         return future
+
+     async def execute_method_async(self, method: str, *args, **kwargs):
+         future = asyncio.get_running_loop().create_future()
+         self._enqueue_task(future, method, args, kwargs)
+         return await future
+
+     def terminate_worker(self):
+         try:
+             self._task_queue.put(_TERMINATE)
+         except ValueError:
+             self.process.kill()
+         self._task_queue.close()
+
+     def kill_worker(self):
+         self._task_queue.close()
+         self.process.kill()
+
+
+ def _run_worker_process(
+     worker_factory: Callable[[], Any],
+     task_queue: Queue,
+     result_queue: Queue,
+ ) -> None:
+     """Worker process event loop"""
+
+     # Add process-specific prefix to stdout and stderr
+     process_name = mp.current_process().name
+     pid = os.getpid()
+     _add_prefix(sys.stdout, process_name, pid)
+     _add_prefix(sys.stderr, process_name, pid)
+
+     # Initialize worker
+     worker = worker_factory()
+     del worker_factory
+
+     # Accept tasks from the engine in task_queue
+     # and return task output in result_queue
+     logger.info("Worker ready; awaiting tasks")
+     try:
+         for items in iter(task_queue.get, _TERMINATE):
+             output = None
+             exception = None
+             task_id, method, args, kwargs = items
+             try:
+                 executor = getattr(worker, method)
+                 output = executor(*args, **kwargs)
+             except BaseException as e:
+                 tb = traceback.format_exc()
+                 logger.error(
+                     "Exception in worker %s while processing method %s: %s, %s",
+                     process_name, method, e, tb)
+                 exception = e
+             result_queue.put(
+                 Result(task_id=task_id, value=output, exception=exception))
+     except KeyboardInterrupt:
+         pass
+     except Exception:
+         logger.exception("Worker failed")
+
+     logger.info("Worker exiting")
+
+
+ def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
+     """Prepend each output line with process-specific prefix"""
+
+     prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
+     file_write = file.write
+
+     def write_with_prefix(s: str):
+         if not s:
+             return
+         if file.start_new_line:  # type: ignore[attr-defined]
+             file_write(prefix)
+         idx = 0
+         while (next_idx := s.find('\n', idx)) != -1:
+             next_idx += 1
+             file_write(s[idx:next_idx])
+             if next_idx == len(s):
+                 file.start_new_line = True  # type: ignore[attr-defined]
+                 return
+             file_write(prefix)
+             idx = next_idx
+         file_write(s[idx:])
+         file.start_new_line = False  # type: ignore[attr-defined]
+
+     file.start_new_line = True  # type: ignore[attr-defined]
+     file.write = write_with_prefix  # type: ignore[method-assign]
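
A minimal wiring sketch for the helpers above, run against a toy stand-in worker; `_EchoWorker` and its `ping` method are invented for illustration, while ResultHandler, ProcessWorkerWrapper, and WorkerMonitor are used exactly as defined in this file:

    from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                      ResultHandler,
                                                      WorkerMonitor)

    class _EchoWorker:
        """Toy stand-in for vllm.worker.worker.Worker (illustration only)."""

        def ping(self, msg: str) -> str:
            return f"pong: {msg}"

    def make_worker() -> _EchoWorker:
        return _EchoWorker()

    if __name__ == "__main__":
        result_handler = ResultHandler()
        workers = [ProcessWorkerWrapper(result_handler, make_worker)
                   for _ in range(2)]
        worker_monitor = WorkerMonitor(workers, result_handler)
        result_handler.start()   # resolves futures from the shared result queue
        worker_monitor.start()   # reaps worker processes that die unexpectedly

        # Dispatch a method by name; ResultFuture.get() blocks until the child
        # process posts a Result back on the result queue.
        future = workers[0].execute_method("ping", "hello")
        print(future.get())      # -> "pong: hello"

        worker_monitor.close()   # terminates workers, closes the result queue
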
vllm/executor/neuron_executor.py
@@ -0,0 +1,91 @@
+ from typing import List, Set, Tuple
+
+ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+ from vllm.logger import init_logger
+ from vllm.lora.request import LoRARequest
+ from vllm.sequence import ExecuteModelRequest, SamplerOutput
+ from vllm.utils import make_async
+
+ logger = init_logger(__name__)
+
+
+ class NeuronExecutor(ExecutorBase):
+
+     def _init_executor(self) -> None:
+         assert (self.lora_config is
+                 None), "LoRA is not supported for Neuron backend."
+         assert (not self.speculative_config
+                 ), "Speculative decoding not yet supported for Neuron backend."
+
+         # Instantiate the worker and load the model to the device.
+         self._init_worker()
+
+     def _init_worker(self):
+         from vllm.worker.neuron_worker import NeuronWorker
+
+         self.driver_worker = NeuronWorker(
+             self.model_config,
+             self.parallel_config,
+             self.scheduler_config,
+             self.device_config,
+             self.cache_config,
+         )
+         self.driver_worker.init_device()
+         self.driver_worker.load_model()
+
+     def determine_num_available_blocks(self) -> Tuple[int, int]:
+         """Determine the number of available KV blocks by invoking the
+         underlying worker.
+         """
+         return self.driver_worker.determine_num_available_blocks()
+
+     def initialize_cache(self, num_gpu_blocks: int,
+                          num_cpu_blocks: int) -> None:
+         """Initialize the KV cache by invoking the underlying worker.
+         """
+         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+     def execute_model(
+             self,
+             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+         assert (execute_model_req.blocks_to_swap_in == {}
+                 and execute_model_req.blocks_to_swap_out == {}
+                 and execute_model_req.blocks_to_copy == {}), (
+                     "Cache operations are not supported for Neuron backend.")
+         assert execute_model_req.num_lookahead_slots == 0, (
+             "lookahead not supported for Neuron backend.")
+
+         output = self.driver_worker.execute_model(
+             execute_model_req.seq_group_metadata_list)
+         return output
+
+     def add_lora(self, lora_request: LoRARequest) -> bool:
+         return self.driver_worker.add_lora(lora_request)
+
+     def remove_lora(self, lora_id: int) -> bool:
+         return self.driver_worker.remove_lora(lora_id)
+
+     def list_loras(self) -> Set[int]:
+         return self.driver_worker.list_loras()
+
+     def check_health(self) -> None:
+         # NeuronExecutor will always be healthy as long as
+         # it's running.
+         return
+
+
+ class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+
+     async def execute_model_async(
+         self,
+         execute_model_req: ExecuteModelRequest,
+     ) -> List[SamplerOutput]:
+         output = await make_async(
+             self.driver_worker.execute_model
+         )(seq_group_metadata_list=execute_model_req.seq_group_metadata_list, )
+         return output
+
+     async def check_health_async(self) -> None:
+         # NeuronExecutor will always be healthy as long as
+         # it's running.
+         return
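
Both GPUExecutorAsync and NeuronExecutorAsync wrap the blocking execute_model call with vllm.utils.make_async, whose source is not part of this diff. A sketch of the pattern that wrapping relies on, assuming make_async simply pushes the blocking callable onto the event loop's default thread-pool executor (the real helper in vllm.utils may differ in detail):

    import asyncio
    from functools import partial
    from typing import Any, Callable

    def make_async_sketch(func: Callable[..., Any]) -> Callable[..., Any]:
        """Illustrative stand-in for vllm.utils.make_async: run a blocking
        callable in the default executor so the event loop is not blocked."""

        def _async_wrapper(*args, **kwargs):
            loop = asyncio.get_event_loop()
            # Bind the arguments now, run the blocking call in a worker thread.
            return loop.run_in_executor(None, partial(func, *args, **kwargs))

        return _async_wrapper

Awaiting the future returned by the wrapper yields whatever the wrapped execute_model returns, i.e. the List[SamplerOutput] seen in the signatures above.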