vllm-npu 0.4.2__py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/executor/ray_gpu_executor.py
ADDED
@@ -0,0 +1,327 @@

import asyncio
import os
import pickle
from collections import defaultdict
from itertools import islice, repeat
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import vllm.envs as envs
from vllm.executor.distributed_gpu_executor import (  # yapf: disable
    DistributedGPUExecutor, DistributedGPUExecutorAsync)
from vllm.executor.ray_utils import RayWorkerWrapper, ray
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        get_vllm_instance_id, make_async)

if ray is not None:
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup

logger = init_logger(__name__)

USE_RAY_COMPILED_DAG = envs.VLLM_USE_RAY_COMPILED_DAG


class RayGPUExecutor(DistributedGPUExecutor):

    def _init_executor(self) -> None:
        assert (not self.speculative_config
                ), "Speculative decoding not yet supported for RayGPU backend."

        assert self.parallel_config.worker_use_ray
        placement_group = self.parallel_config.placement_group

        # Disable Ray usage stats collection.
        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
        if ray_usage != "1":
            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"

        # Create the parallel GPU workers.
        self._init_workers_ray(placement_group)

        self.forward_dag = None
        if USE_RAY_COMPILED_DAG:
            self.forward_dag = self._compiled_ray_dag()

    def _configure_ray_workers_use_nsight(self,
                                          ray_remote_kwargs) -> Dict[str, Any]:
        # If nsight profiling is enabled, we need to set the profiling
        # configuration for the ray workers as runtime env.
        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
        runtime_env.update({
            "nsight": {
                "t": "cuda,cudnn,cublas",
                "o": "'worker_process_%p'",
                "cuda-graph-trace": "node",
            }
        })

        return ray_remote_kwargs

    def _init_workers_ray(self, placement_group: "PlacementGroup",
                          **ray_remote_kwargs):
        if self.parallel_config.tensor_parallel_size == 1:
            # For single GPU case, we use a ray worker with constrained memory.
            num_gpus = self.cache_config.gpu_memory_utilization
        else:
            # Otherwise, the ray workers are allocated with a full GPU.
            num_gpus = 1

        # The driver dummy worker does not actually use any resources.
        # It holds the resource for the driver worker.
        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
        # The remaining workers are the actual ray actors.
        self.workers: List[RayWorkerWrapper] = []

        if self.parallel_config.ray_workers_use_nsight:
            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
                ray_remote_kwargs)

        # Create the workers.
        driver_ip = get_ip()
        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
            if not bundle.get("GPU", 0):
                continue
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=placement_group,
                placement_group_capture_child_tasks=True,
                placement_group_bundle_index=bundle_id,
            )
            worker = ray.remote(
                num_cpus=0,
                num_gpus=num_gpus,
                scheduling_strategy=scheduling_strategy,
                **ray_remote_kwargs,
            )(RayWorkerWrapper).remote(
                worker_module_name="vllm.worker.worker",
                worker_class_name="Worker",
                trust_remote_code=self.model_config.trust_remote_code,
            )

            worker_ip = ray.get(worker.get_node_ip.remote())
            if worker_ip == driver_ip and self.driver_dummy_worker is None:
                # If the worker is on the same node as the driver, we use it
                # as the resource holder for the driver process.
                self.driver_dummy_worker = worker
                self.driver_worker = RayWorkerWrapper(
                    worker_module_name="vllm.worker.worker",
                    worker_class_name="Worker",
                    trust_remote_code=self.model_config.trust_remote_code,
                )
            else:
                # Else, added to the list of workers.
                self.workers.append(worker)

        if self.driver_dummy_worker is None:
            raise ValueError(
                "Ray does not allocate any GPUs on the driver node. Consider "
                "adjusting the Ray placement group or running the driver on a "
                "GPU node.")

        # Get the set of GPU IDs used on each node.
        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
                                                    use_dummy_driver=True)

        node_workers = defaultdict(list)
        node_gpus = defaultdict(list)

        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
            node_workers[node_id].append(i)
            node_gpus[node_id].extend(gpu_ids)
        for node_id, gpu_ids in node_gpus.items():
            node_gpus[node_id] = sorted(gpu_ids)

        VLLM_INSTANCE_ID = get_vllm_instance_id()

        # Set environment variables for the driver and workers.
        all_args_to_update_environment_variables = [({
            "CUDA_VISIBLE_DEVICES":
            ",".join(map(str, node_gpus[node_id])),
            "VLLM_INSTANCE_ID":
            VLLM_INSTANCE_ID,
            "VLLM_TRACE_FUNCTION":
            str(envs.VLLM_TRACE_FUNCTION),
        }, ) for (node_id, _) in worker_node_and_gpu_ids]
        self._run_workers("update_environment_variables",
                          all_args=all_args_to_update_environment_variables)

        distributed_init_method = get_distributed_init_method(
            driver_ip, get_open_port())

        # Initialize the actual workers inside worker wrapper.
        init_worker_all_kwargs = [
            self._get_worker_kwargs(
                local_rank=node_workers[node_id].index(rank),
                rank=rank,
                distributed_init_method=distributed_init_method,
            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
        ]
        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)

        self._run_workers("init_device")
        self._run_workers("load_model",
                          max_concurrent_workers=self.parallel_config.
                          max_parallel_loading_workers)

    def execute_model(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        all_outputs = self._run_workers(
            "execute_model",
            driver_kwargs={"execute_model_req": execute_model_req},
            use_ray_compiled_dag=USE_RAY_COMPILED_DAG)

        # Only the driver worker returns the sampling results.
        return all_outputs[0]

    def _run_workers(
        self,
        method: str,
        *args,
        driver_args: Optional[Tuple[Any, ...]] = None,
        driver_kwargs: Optional[Dict[str, Any]] = None,
        all_args: Optional[List[Tuple[Any, ...]]] = None,
        all_kwargs: Optional[List[Dict[str, Any]]] = None,
        use_dummy_driver: bool = False,
        max_concurrent_workers: Optional[int] = None,
        use_ray_compiled_dag: bool = False,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers. Can be used in the following
        ways:

        - args/kwargs: All workers share the same args/kwargs
        - args/kwargs and driver_args/driver_kwargs: Driver worker has
          different args
        - all_args/all_kwargs: args/kwargs for each worker are specified
          individually
        """

        if max_concurrent_workers:
            raise NotImplementedError(
                "max_concurrent_workers is not supported yet.")

        if driver_args is None:
            driver_args = args if all_args is None else all_args[0]
        if driver_kwargs is None:
            driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]

        count = len(self.workers)
        all_worker_args = repeat(args, count) if all_args is None \
            else islice(all_args, 1, None)
        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
            else islice(all_kwargs, 1, None)

        if use_ray_compiled_dag:
            # Right now, compiled DAG can only accept a single
            # input. TODO(sang): Fix it.
            assert self.forward_dag is not None
            output_channels = self.forward_dag.execute(1)
        else:
            # Start the ray workers first.
            ray_worker_outputs = [
                worker.execute_method.remote(method, *worker_args,
                                             **worker_kwargs)
                for (worker, worker_args, worker_kwargs
                     ) in zip(self.workers, all_worker_args, all_worker_kwargs)
            ]

        # Start the driver worker after all the ray workers.
        if not use_dummy_driver:
            driver_worker_output = self.driver_worker.execute_method(
                method, *driver_args, **driver_kwargs)
        else:
            assert self.driver_dummy_worker is not None
            driver_worker_output = ray.get(
                self.driver_dummy_worker.execute_method.remote(
                    method, *driver_args, **driver_kwargs))
        # Get the results of the ray workers.
        if self.workers:
            if use_ray_compiled_dag:
                try:
                    ray_worker_outputs = [
                        pickle.loads(chan.begin_read())
                        for chan in output_channels
                    ]
                finally:
                    # Has to call end_read in order to reuse the DAG.
                    for chan in output_channels:
                        chan.end_read()
            else:
                ray_worker_outputs = ray.get(ray_worker_outputs)

        return [driver_worker_output] + ray_worker_outputs

    def _compiled_ray_dag(self):
        import pkg_resources
        required_version = "2.9"
        current_version = pkg_resources.get_distribution("ray").version
        if current_version < required_version:
            raise ValueError(f"Ray version {required_version} or greater is "
                             f"required, but found {current_version}")

        from ray.dag import InputNode, MultiOutputNode
        assert self.parallel_config.worker_use_ray

        # Right now, compiled DAG requires at least 1 arg. We send
        # a dummy value for now. It will be fixed soon.
        with InputNode() as input_data:
            forward_dag = MultiOutputNode([
                worker.execute_model_compiled_dag_remote.
                bind(  # type: ignore[attr-defined]
                    input_data) for worker in self.workers
            ])
        return forward_dag.experimental_compile()

    def check_health(self) -> None:
        """Raises an error if engine is unhealthy."""
        self._check_if_any_actor_is_dead()

    def _check_if_any_actor_is_dead(self):
        if not self.workers:
            return

        dead_actors = []
        for actor in self.workers:
            actor_state = ray.state.actors(actor._ray_actor_id.hex())  # pylint: disable=protected-access
            if actor_state["State"] == "DEAD":
                dead_actors.append(actor)
        if dead_actors:
            raise RuntimeError("At least one Worker is dead. "
                               f"Dead Workers: {dead_actors}. ")


class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver_executor = make_async(self.driver_worker.execute_method)

    async def _run_workers_async(
        self,
        method: str,
        *args,
        driver_args: Optional[Tuple[Any, ...]] = None,
        driver_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
        coros = []

        if driver_args is None:
            driver_args = args
        if driver_kwargs is None:
            driver_kwargs = kwargs

        coros.append(
            self.driver_executor(method, *driver_args, **driver_kwargs))

        # Run the ray workers asynchronously.
        for worker in self.workers:
            coros.append(worker.execute_method.remote(method, *args, **kwargs))

        all_outputs = await asyncio.gather(*coros)
        return all_outputs
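The split between the driver worker and the Ray actors in _run_workers above is easy to miss: when all_args/all_kwargs are supplied, element 0 is reserved for the driver and the remaining elements go to the Ray workers; otherwise every worker shares the same args/kwargs. The following standalone sketch (not part of the wheel; the helper name split_worker_args is invented for illustration) reproduces just that dispatch logic so it can be exercised without Ray or GPUs.

from itertools import islice, repeat

def split_worker_args(num_workers, args=(), kwargs=None,
                      all_args=None, all_kwargs=None):
    # Mirrors _run_workers: the driver takes element 0 of all_args/all_kwargs,
    # the Ray workers take the rest; otherwise everyone shares args/kwargs.
    kwargs = kwargs or {}
    driver_args = args if all_args is None else all_args[0]
    driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
    worker_args = repeat(args, num_workers) if all_args is None \
        else islice(all_args, 1, None)
    worker_kwargs = repeat(kwargs, num_workers) if all_kwargs is None \
        else islice(all_kwargs, 1, None)
    return driver_args, driver_kwargs, list(worker_args), list(worker_kwargs)

# Shared arguments: every worker sees the same tuple.
print(split_worker_args(2, args=("init_device",)))
# Per-worker arguments: the first entry is consumed by the driver.
print(split_worker_args(2, all_args=[("rank0",), ("rank1",), ("rank2",)]))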
vllm/executor/ray_utils.py
ADDED
@@ -0,0 +1,119 @@

import pickle
from typing import List, Optional, Tuple

from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.utils import get_ip, is_hip
from vllm.worker.worker_base import WorkerWrapperBase

logger = init_logger(__name__)

try:
    import ray

    class RayWorkerWrapper(WorkerWrapperBase):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            # The compiled DAG runs the main execution in a different
            # thread, which calls cuda.set_device. This flag records
            # whether set_device has been called on that thread.
            self.compiled_dag_cuda_device_set = False

        def get_node_ip(self) -> str:
            return get_ip()

        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
            node_id = ray.get_runtime_context().get_node_id()
            gpu_ids = ray.get_gpu_ids()
            return node_id, gpu_ids

        def execute_model_compiled_dag_remote(self, ignored):
            """Used only when compiled DAG is enabled."""
            import torch
            if not self.compiled_dag_cuda_device_set:
                torch.cuda.set_device(self.worker.device)
                self.compiled_dag_cuda_device_set = True

            output = self.worker.execute_model()
            output = pickle.dumps(output)
            return output

except ImportError as e:
    logger.warning(
        "Failed to import Ray with %r. For distributed inference, "
        "please install Ray with `pip install ray`.", e)
    ray = None  # type: ignore
    RayWorkerWrapper = None  # type: ignore


def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    It will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
    if ray is None:
        raise ImportError(
            "Ray is not installed. Please install Ray to use distributed "
            "serving.")

    # Connect to a ray cluster.
    if is_hip():
        ray.init(address=ray_address,
                 ignore_reinit_error=True,
                 num_gpus=parallel_config.world_size)
    else:
        ray.init(address=ray_address, ignore_reinit_error=True)

    if parallel_config.placement_group:
        # Placement group is already set.
        return

    # Create placement group for worker processes
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        gpu_bundles = 0
        for bundle in bundles:
            bundle_gpus = bundle.get("GPU", 0)
            if bundle_gpus > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 GPU.")
            if bundle_gpus:
                gpu_bundles += 1
        if parallel_config.world_size > gpu_bundles:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the placement group.")
    else:
        num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
        if parallel_config.world_size > num_gpus_in_cluster:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the cluster.")
        # Create a new placement group
        placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size)
        current_placement_group = ray.util.placement_group(
            placement_group_specs)
        # Wait until PG is ready - this will block until all
        # requested resources are available, and will timeout
        # if they cannot be provisioned.
        ray.get(current_placement_group.ready(), timeout=1800)

    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group
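The try/except block above is an optional-dependency guard: if ray cannot be imported, both ray and RayWorkerWrapper are set to None and initialize_ray_cluster fails fast with an ImportError. A minimal sketch of the same pattern, using a hypothetical optional package name (some_optional_backend) purely for illustration:

import logging

logger = logging.getLogger(__name__)

try:
    import some_optional_backend  # hypothetical optional dependency

    def run_distributed(task):
        # Happy path: the optional backend is present.
        return some_optional_backend.run(task)

except ImportError as e:
    # Degrade gracefully: log once, keep the names importable, and fail
    # only when the distributed code path is actually exercised.
    logger.warning("Optional backend unavailable (%r); "
                   "distributed execution is disabled.", e)
    some_optional_backend = None  # type: ignore

    def run_distributed(task):
        raise ImportError("Install the optional backend to run distributed.")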
vllm/logger.py
ADDED
@@ -0,0 +1,153 @@
"""Logging configuration for vLLM."""
import datetime
import json
import logging
import os
import sys
from functools import partial
from logging import Logger
from logging.config import dictConfig
from os import path
from typing import Dict, Optional

import vllm.envs as envs

VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH

_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
_DATE_FORMAT = "%m-%d %H:%M:%S"

DEFAULT_LOGGING_CONFIG = {
    "formatters": {
        "vllm": {
            "class": "vllm.logging.NewLineFormatter",
            "datefmt": _DATE_FORMAT,
            "format": _FORMAT,
        },
    },
    "handlers": {
        "vllm": {
            "class": "logging.StreamHandler",
            "formatter": "vllm",
            "level": "INFO",
            "stream": "ext://sys.stdout",
        },
    },
    "loggers": {
        "vllm": {
            "handlers": ["vllm"],
            "level": "DEBUG",
            "propagate": False,
        },
    },
    "version": 1,
}


def _configure_vllm_root_logger() -> None:
    logging_config: Optional[Dict] = None

    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
        raise RuntimeError(
            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
            "implies VLLM_CONFIGURE_LOGGING. Please enable "
            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")

    if VLLM_CONFIGURE_LOGGING:
        logging_config = DEFAULT_LOGGING_CONFIG

    if VLLM_LOGGING_CONFIG_PATH:
        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
            raise RuntimeError(
                "Could not load logging config. File does not exist: %s",
                VLLM_LOGGING_CONFIG_PATH)
        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8",
                  mode="r") as file:
            custom_config = json.loads(file.read())

        if not isinstance(custom_config, dict):
            raise ValueError("Invalid logging config. Expected Dict, got %s.",
                             type(custom_config).__name__)
        logging_config = custom_config

    if logging_config:
        dictConfig(logging_config)


def init_logger(name: str) -> Logger:
    """The main purpose of this function is to ensure that loggers are
    retrieved in such a way that we can be sure the root vllm logger has
    already been configured."""

    return logging.getLogger(name)


# The root logger is initialized when the module is imported.
# This is thread-safe as the module is only imported once,
# guaranteed by the Python GIL.
_configure_vllm_root_logger()

logger = init_logger(__name__)


def _trace_calls(log_path, root_dir, frame, event, arg=None):
    if event in ['call', 'return']:
        # Extract the filename, line number, function name, and the code object
        filename = frame.f_code.co_filename
        lineno = frame.f_lineno
        func_name = frame.f_code.co_name
        if not filename.startswith(root_dir):
            # only log the functions in the vllm root_dir
            return
        # Log every function call or return
        try:
            last_frame = frame.f_back
            if last_frame is not None:
                last_filename = last_frame.f_code.co_filename
                last_lineno = last_frame.f_lineno
                last_func_name = last_frame.f_code.co_name
            else:
                # initial frame
                last_filename = ""
                last_lineno = 0
                last_func_name = ""
            with open(log_path, 'a') as f:
                if event == 'call':
                    f.write(f"{datetime.datetime.now()} Call to"
                            f" {func_name} in {filename}:{lineno}"
                            f" from {last_func_name} in {last_filename}:"
                            f"{last_lineno}\n")
                else:
                    f.write(f"{datetime.datetime.now()} Return from"
                            f" {func_name} in {filename}:{lineno}"
                            f" to {last_func_name} in {last_filename}:"
                            f"{last_lineno}\n")
        except NameError:
            # modules are deleted during shutdown
            pass
    return partial(_trace_calls, log_path, root_dir)


def enable_trace_function_call(log_file_path: str,
                               root_dir: Optional[str] = None):
    """
    Enable tracing of every function call in code under `root_dir`.
    This is useful for debugging hangs or crashes.
    `log_file_path` is the path to the log file.
    `root_dir` is the root directory of the code to trace. If None, it is the
    vllm root directory.

    Note that this call is thread-level, any threads calling this function
    will have the trace enabled. Other threads will not be affected.
    """
    logger.warning(
        "VLLM_TRACE_FUNCTION is enabled. It will record every"
        " function executed by Python. This will slow down the code. It "
        "is suggested to be used for debugging hang or crashes only.")
    logger.info("Trace frame log is saved to %s", log_file_path)
    if root_dir is None:
        # by default, this is the vllm root directory
        root_dir = os.path.dirname(os.path.dirname(__file__))
    sys.settrace(partial(_trace_calls, log_file_path, root_dir))
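Operators who want a different log layout can point vLLM at their own dictConfig-style JSON through the two environment variables read at the top of this module. A hedged sketch follows; the file path and format string are illustrative, and the JSON must follow the same dictConfig schema as DEFAULT_LOGGING_CONFIG above. Both variables must be set before vllm (and hence vllm.logger) is imported.

import json
import os

# Illustrative path; any readable location works.
config_path = "/tmp/vllm_logging.json"

custom = {
    "version": 1,
    "formatters": {
        "plain": {"format": "%(asctime)s %(levelname)s %(name)s: %(message)s"},
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "plain",
            "level": "DEBUG",
            "stream": "ext://sys.stderr",
        },
    },
    "loggers": {
        "vllm": {"handlers": ["console"], "level": "DEBUG",
                 "propagate": False},
    },
}

with open(config_path, "w", encoding="utf-8") as f:
    json.dump(custom, f)

# VLLM_LOGGING_CONFIG_PATH implies VLLM_CONFIGURE_LOGGING, so enable both.
os.environ["VLLM_CONFIGURE_LOGGING"] = "1"
os.environ["VLLM_LOGGING_CONFIG_PATH"] = config_path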
vllm/logging/__init__.py
ADDED
@@ -0,0 +1,15 @@
import logging


class NewLineFormatter(logging.Formatter):
    """Adds logging prefix to newlines to align multi-line messages."""

    def __init__(self, fmt, datefmt=None, style="%"):
        logging.Formatter.__init__(self, fmt, datefmt, style)

    def format(self, record):
        msg = logging.Formatter.format(self, record)
        if record.message != "":
            parts = msg.split(record.message)
            msg = msg.replace("\n", "\r\n" + parts[0])
        return msg
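The formatter can also be used on its own outside the default dictConfig. A small sketch (the logger name and format string are illustrative) showing how continuation lines of a multi-line message get re-prefixed:

import logging
from vllm.logging import NewLineFormatter

handler = logging.StreamHandler()
handler.setFormatter(
    NewLineFormatter("%(levelname)s %(asctime)s %(message)s",
                     datefmt="%m-%d %H:%M:%S"))

log = logging.getLogger("newline_demo")
log.addHandler(handler)
log.setLevel(logging.INFO)

# The second line is printed with the same "LEVEL timestamp " prefix.
log.info("first line\nsecond line")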
vllm/lora/__init__.py
ADDED
File without changes