vllm_npu-0.4.2-py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/engine/llm_engine.py
@@ -0,0 +1,784 @@
import time
from typing import Iterable, List, Optional, Type, Union

from transformers import GenerationConfig, PreTrainedTokenizer

import vllm
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                         LoRAConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig, SpeculativeConfig,
                         VisionLanguageConfig)
from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
                                 SchedulerOutputs)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import StatLogger, Stats
from vllm.engine.output_processor.interfaces import (
    SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.engine.output_processor.util import create_output_by_sequence_group
from vllm.executor.executor_base import ExecutorBase
from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (ExecuteModelRequest, MultiModalData, SamplerOutput,
                           Sequence, SequenceGroup, SequenceGroupMetadata,
                           SequenceStatus)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                     get_tokenizer_group)
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                  usage_message)
from vllm.utils import Counter

logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5


def _load_generation_config_dict(model_config: ModelConfig):
    try:
        return GenerationConfig.from_pretrained(
            model_config.model,
            revision=model_config.revision,
        ).to_diff_dict()
    except OSError:
        # Not found.
        return {}


class LLMEngine:
    """An LLM engine that receives requests and generates texts.

    This is the main class for the vLLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The `LLM` class wraps this class for offline batched inference and the
    `AsyncLLMEngine` class wraps this class for online serving.

    NOTE: The config arguments are derived from the `EngineArgs` class. For the
    comprehensive list of arguments, see `EngineArgs`.

    Args:
        model_config: The configuration related to the LLM model.
        cache_config: The configuration related to the KV cache memory
            management.
        parallel_config: The configuration related to distributed execution.
        scheduler_config: The configuration related to the request scheduler.
        device_config: The configuration related to the device.
        lora_config (Optional): The configuration related to serving multi-LoRA.
        vision_language_config (Optional): The configuration related to vision
            language models.
        speculative_config (Optional): The configuration related to speculative
            decoding.
        executor_class: The model executor class for managing distributed
            execution.
        log_stats: Whether to log statistics.
        usage_context: Specified entry point, used for usage info collection
    """

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        vision_language_config: Optional[VisionLanguageConfig],
        speculative_config: Optional[SpeculativeConfig],
        decoding_config: Optional[DecodingConfig],
        executor_class: Type[ExecutorBase],
        log_stats: bool,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> None:
        logger.info(
            "Initializing an LLM engine (v%s) with config: "
            "model=%r, speculative_config=%r, tokenizer=%r, "
            "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
            "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
            "max_seq_len=%d, download_dir=%r, load_format=%s, "
            "tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
            "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
            "quantization_param_path=%s, device_config=%s, "
            "decoding_config=%r, seed=%d, served_model_name=%s)",
            vllm.__version__,
            model_config.model,
            speculative_config,
            model_config.tokenizer,
            model_config.skip_tokenizer_init,
            model_config.tokenizer_mode,
            model_config.revision,
            model_config.tokenizer_revision,
            model_config.trust_remote_code,
            model_config.dtype,
            model_config.max_model_len,
            load_config.download_dir,
            load_config.load_format,
            parallel_config.tensor_parallel_size,
            parallel_config.disable_custom_all_reduce,
            model_config.quantization,
            model_config.enforce_eager,
            cache_config.cache_dtype,
            model_config.quantization_param_path,
            device_config.device,
            decoding_config,
            model_config.seed,
            model_config.served_model_name,
        )
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.vision_language_config = vision_language_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.speculative_config = speculative_config
        self.load_config = load_config
        self.decoding_config = decoding_config or DecodingConfig()
        self.log_stats = log_stats

        if not self.model_config.skip_tokenizer_init:
            self.tokenizer: BaseTokenizerGroup
            self._init_tokenizer()
            self.detokenizer = Detokenizer(self.tokenizer)
        else:
            self.detokenizer = None
            self.tokenizer = None

        self.seq_counter = Counter()
        self.generation_config_fields = _load_generation_config_dict(
            model_config)

        self.model_executor = executor_class(
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            vision_language_config=vision_language_config,
            speculative_config=speculative_config,
            load_config=load_config,
        )

        self._initialize_kv_caches()

        # If usage stat is enabled, collect relevant info.
        if is_usage_stats_enabled():
            from vllm.model_executor.model_loader import (
                get_architecture_class_name)
            usage_message.report_usage(
                get_architecture_class_name(model_config),
                usage_context,
                extra_kvs={
                    # Common configuration
                    "dtype":
                    str(model_config.dtype),
                    "tensor_parallel_size":
                    parallel_config.tensor_parallel_size,
                    "block_size":
                    cache_config.block_size,
                    "gpu_memory_utilization":
                    cache_config.gpu_memory_utilization,

                    # Quantization
                    "quantization":
                    model_config.quantization,
                    "kv_cache_dtype":
                    cache_config.cache_dtype,

                    # Feature flags
                    "enable_lora":
                    bool(lora_config),
                    "enable_prefix_caching":
                    cache_config.enable_prefix_caching,
                    "enforce_eager":
                    model_config.enforce_eager,
                    "disable_custom_all_reduce":
                    parallel_config.disable_custom_all_reduce,
                })

        if self.tokenizer:
            # Ping the tokenizer to ensure liveness if it runs in a
            # different process.
            self.tokenizer.ping()

        # Create the scheduler.
        # NOTE: the cache_config here have been updated with the numbers of
        # GPU and CPU blocks, which are profiled in the distributed executor.
        self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)

        # Metric Logging.
        if self.log_stats:
            self.stat_logger = StatLogger(
                local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                labels=dict(model_name=model_config.served_model_name),
                max_model_len=self.model_config.max_model_len)
            self.stat_logger.info("cache_config", self.cache_config)

        # Create sequence output processor, e.g. for beam search or
        # speculative decoding.
        self.output_processor = (
            SequenceGroupOutputProcessor.create_output_processor(
                self.scheduler_config,
                self.detokenizer,
                self.scheduler,
                self.seq_counter,
                self.get_tokenizer_for_seq,
                stop_checker=StopChecker(
                    self.scheduler_config.max_model_len,
                    self.get_tokenizer_for_seq,
                ),
            ))

    def _initialize_kv_caches(self) -> None:
        """Initialize the KV cache in the worker(s).

        The workers will determine the number of blocks in both the GPU cache
        and the swap CPU cache.
        """
        num_gpu_blocks, num_cpu_blocks = (
            self.model_executor.determine_num_available_blocks())

        if self.cache_config.num_gpu_blocks_override is not None:
            num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
            logger.info(
                "Overriding num_gpu_blocks=%d with "
                "num_gpu_blocks_override=%d", num_gpu_blocks,
                num_gpu_blocks_override)
            num_gpu_blocks = num_gpu_blocks_override

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_config = engine_args.create_engine_config()

        # Initialize the cluster and specify the executor class.
        if engine_config.device_config.device_type == "neuron":
            from vllm.executor.neuron_executor import NeuronExecutor
            executor_class = NeuronExecutor
        elif engine_config.device_config.device_type == "cpu":
            from vllm.executor.cpu_executor import CPUExecutor
            executor_class = CPUExecutor
        elif engine_config.parallel_config.worker_use_ray:
            initialize_ray_cluster(engine_config.parallel_config)
            from vllm.executor.ray_gpu_executor import RayGPUExecutor
            executor_class = RayGPUExecutor
        else:
            assert engine_config.parallel_config.world_size == 1, (
                "Ray is required if parallel_config.world_size > 1.")
            from vllm.executor.gpu_executor import GPUExecutor
            executor_class = GPUExecutor

        # Create the LLM engine.
        engine = cls(
            **engine_config.to_dict(),
            executor_class=executor_class,
            log_stats=not engine_args.disable_log_stats,
            usage_context=usage_context,
        )
        return engine

    def __reduce__(self):
        # This is to ensure that the LLMEngine is not referenced in
        # the closure used to initialize Ray worker actors
        raise RuntimeError("LLMEngine should not be pickled!")

    def __del__(self):
        # Shutdown model executor when engine is garbage collected
        # Use getattr since __init__ can fail before the field is set
        if model_executor := getattr(self, "model_executor", None):
            model_executor.shutdown()

    def get_tokenizer(self) -> "PreTrainedTokenizer":
        return self.tokenizer.get_lora_tokenizer(None)

    def get_tokenizer_for_seq(self,
                              sequence: Sequence) -> "PreTrainedTokenizer":
        return self.tokenizer.get_lora_tokenizer(sequence.lora_request)

    def _init_tokenizer(self, **tokenizer_init_kwargs):
        init_kwargs = dict(
            tokenizer_id=self.model_config.tokenizer,
            enable_lora=bool(self.lora_config),
            max_num_seqs=self.scheduler_config.max_num_seqs,
            max_input_length=None,
            tokenizer_mode=self.model_config.tokenizer_mode,
            trust_remote_code=self.model_config.trust_remote_code,
            revision=self.model_config.tokenizer_revision)
        init_kwargs.update(tokenizer_init_kwargs)
        self.tokenizer = get_tokenizer_group(
            self.parallel_config.tokenizer_pool_config, **init_kwargs)

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)
        if self.lora_config:
            self.lora_config.verify_with_model_config(self.model_config)
            self.lora_config.verify_with_scheduler_config(
                self.scheduler_config)

    def encode_request(
        self,
        request_id: str,  # pylint: disable=unused-argument
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = None,
        lora_request: Optional[LoRARequest] = None,
    ):
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = self.tokenizer.encode(request_id=request_id,
                                                     prompt=prompt,
                                                     lora_request=lora_request)
        return prompt_token_ids

    def add_request(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        """Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters for text generation.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            arrival_time: The arrival time of the request. If None, we use
                the current monotonic time.
            multi_modal_data: Multi modal data per request.

        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
            - Create `best_of` number of :class:`~vllm.Sequence` objects.
            - Create a :class:`~vllm.SequenceGroup` object
              from the list of :class:`~vllm.Sequence`.
            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.

        Example:
            >>> # initialize engine
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> # set request arguments
            >>> example_prompt = "Who is the president of the United States?"
            >>> sampling_params = SamplingParams(temperature=0.0)
            >>> request_id = 0
            >>>
            >>> # add the request to the engine
            >>> engine.add_request(
            >>>    str(request_id),
            >>>    example_prompt,
            >>>    SamplingParams(temperature=0.0))
            >>> # continue the request processing
            >>> ...
        """
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        max_logprobs = self.get_model_config().max_logprobs
        if (sampling_params.logprobs
                and sampling_params.logprobs > max_logprobs) or (
                    sampling_params.prompt_logprobs
                    and sampling_params.prompt_logprobs > max_logprobs):
            raise ValueError(f"Cannot request more than "
                             f"{max_logprobs} logprobs.")
        if arrival_time is None:
            arrival_time = time.time()
        prompt_token_ids = self.encode_request(
            request_id=request_id,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
            lora_request=lora_request)

        # Create the sequences.
        block_size = self.cache_config.block_size
        seq_id = next(self.seq_counter)
        eos_token_id = None
        if self.tokenizer:
            eos_token_id = self.tokenizer.get_lora_tokenizer(
                lora_request).eos_token_id
        else:
            logger.warning("Use None for EOS token id because tokenizer is "
                           "not initialized")
        seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                       eos_token_id, lora_request)

        # Defensive copy of SamplingParams, which are used by the sampler,
        # this doesn't deep-copy LogitsProcessor objects
        sampling_params = sampling_params.clone()
        # Add the eos token id into the sampling_params to support min_tokens
        # processing
        if seq.eos_token_id is not None:
            sampling_params.all_stop_token_ids.add(seq.eos_token_id)
        sampling_params.update_from_generation_config(
            self.generation_config_fields)

        # Create the sequence group.
        seq_group = SequenceGroup(request_id, [seq], sampling_params,
                                  arrival_time, lora_request, multi_modal_data)

        # Add the sequence group to the scheduler.
        self.scheduler.add_seq_group(seq_group)

    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a request(s) with the given ID.

        Args:
            request_id: The ID(s) of the request to abort.

        Details:
            - Refer to the
              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
              from class :class:`~vllm.core.scheduler.Scheduler`.

        Example:
            >>> # initialize engine and add a request with request_id
            >>> request_id = str(0)
            >>> # abort the request
            >>> engine.abort_request(request_id)
        """
        self.scheduler.abort_seq_group(request_id)

    def get_model_config(self) -> ModelConfig:
        """Gets the model configuration."""
        return self.model_config

    def get_decoding_config(self) -> DecodingConfig:
        """Gets the decoding configuration."""
        return self.decoding_config

    def get_num_unfinished_requests(self) -> int:
        """Gets the number of unfinished requests."""
        return self.scheduler.get_num_unfinished_seq_groups()

    def has_unfinished_requests(self) -> bool:
        """Returns True if there are unfinished requests."""
        return self.scheduler.has_unfinished_seqs()

    def _process_model_outputs(
        self,
        output: List[SamplerOutput],
        scheduled_seq_groups: List[ScheduledSequenceGroup],
        ignored_seq_groups: List[SequenceGroup],
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> List[RequestOutput]:
        """Apply the model output to the sequences in the scheduled seq groups.

        Returns RequestOutputs that can be returned to the client.
        """

        now = time.time()

        # Organize outputs by [sequence group][step] instead of
        # [step][sequence group].
        output_by_sequence_group = create_output_by_sequence_group(
            sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups))

        # Update the scheduled sequence groups with the model outputs.
        for scheduled_seq_group, outputs, seq_group_meta in zip(
                scheduled_seq_groups, output_by_sequence_group,
                seq_group_metadata_list):
            seq_group = scheduled_seq_group.seq_group
            seq_group.update_num_computed_tokens(
                scheduled_seq_group.token_chunk_size)

            self.output_processor.process_prompt_logprob(seq_group, outputs)
            if seq_group_meta.do_sample:
                self.output_processor.process_outputs(seq_group, outputs)

        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

        # Create the outputs.
        request_outputs: List[RequestOutput] = []
        for scheduled_seq_group in scheduled_seq_groups:
            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)
        for seq_group in ignored_seq_groups:
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)
        return request_outputs

    def step(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.

        .. figure:: https://i.imgur.com/sv2HssD.png
            :alt: Overview of the step function
            :align: center

            Overview of the step function.

        Details:
            - Step 1: Schedules the sequences to be executed in the next
              iteration and the token blocks to be swapped in/out/copy.

                - Depending on the scheduling policy,
                  sequences may be `preempted/reordered`.
                - A Sequence Group (SG) refer to a group of sequences
                  that are generated from the same prompt.

            - Step 2: Calls the distributed executor to execute the model.
            - Step 3: Processes the model output. This mainly includes:

                - Decodes the relevant outputs.
                - Updates the scheduled sequence groups with model outputs
                  based on its `sampling parameters` (`use_beam_search` or not).
                - Frees the finished sequence groups.

            - Finally, it creates and returns the newly generated results.

        Example:
            >>> # Please see the example/ folder for more detailed examples.
            >>>
            >>> # initialize engine and request arguments
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> example_inputs = [(0, "What is LLM?",
            >>>    SamplingParams(temperature=0.0))]
            >>>
            >>> # Start the engine with an event loop
            >>> while True:
            >>>     if example_inputs:
            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
            >>>         engine.add_request(str(req_id), prompt, sampling_params)
            >>>
            >>>     # continue the request processing
            >>>     request_outputs = engine.step()
            >>>     for request_output in request_outputs:
            >>>         if request_output.finished:
            >>>             # return or show the request output
            >>>
            >>>     if not (engine.has_unfinished_requests() or example_inputs):
            >>>         break
        """
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()

        if not scheduler_outputs.is_empty():
            execute_model_req = ExecuteModelRequest(
                seq_group_metadata_list=seq_group_metadata_list,
                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                blocks_to_copy=scheduler_outputs.blocks_to_copy,
                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
                running_queue_size=scheduler_outputs.running_queue_size,
            )
            output = self.model_executor.execute_model(
                execute_model_req=execute_model_req)
        else:
            output = []

        request_outputs = self._process_model_outputs(
            output, scheduler_outputs.scheduled_seq_groups,
            scheduler_outputs.ignored_seq_groups, seq_group_metadata_list)

        # Log stats.
        self.do_log_stats(scheduler_outputs, output)

        return request_outputs

    def do_log_stats(
            self,
            scheduler_outputs: Optional[SchedulerOutputs] = None,
            model_output: Optional[List[SamplerOutput]] = None) -> None:
        """Forced log when no requests active."""
        if self.log_stats:
            self.stat_logger.log(
                self._get_stats(scheduler_outputs, model_output))

    def _get_stats(
            self,
            scheduler_outputs: Optional[SchedulerOutputs],
            model_output: Optional[List[SamplerOutput]] = None) -> Stats:
        """Get Stats to be Logged to Prometheus.

        Args:
            scheduler_outputs: Optional, used to populate metrics related to
                the scheduled batch,
            model_output: Optional, used to emit speculative decoding metrics
                which are created by the workers.
        """
        now = time.time()

        # System State
        #   Scheduler State
        num_running_sys = len(self.scheduler.running)
        num_swapped_sys = len(self.scheduler.swapped)
        num_waiting_sys = len(self.scheduler.waiting)

        # KV Cache Usage in %
        num_total_gpu = self.cache_config.num_gpu_blocks
        num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
        gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)

        num_total_cpu = self.cache_config.num_cpu_blocks
        cpu_cache_usage_sys = 0.
        if num_total_cpu > 0:
            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
            )
            cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)

        # Iteration stats
        num_prompt_tokens_iter = 0
        num_generation_tokens_iter = 0
        time_to_first_tokens_iter: List[float] = []
        time_per_output_tokens_iter: List[float] = []

        # Request stats
        #   Latency
        time_e2e_requests: List[float] = []
        #   Metadata
        num_prompt_tokens_requests: List[int] = []
        num_generation_tokens_requests: List[int] = []
        best_of_requests: List[int] = []
        n_requests: List[int] = []
        finished_reason_requests: List[str] = []

        # NOTE: This loop assumes prefill seq_groups are before
        # decode seq_groups in scheduled_seq_groups.
        if scheduler_outputs is not None:
            num_generation_tokens_from_prefill_groups = 0.
            # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
            # the len of scheduler_outputs.scheduled_seq_groups is !=
            # scheduler_outputs.num_prefill_groups, this means that
            # chunked prefills have been detected.

            for idx, scheduled_seq_group in enumerate(
                    scheduler_outputs.scheduled_seq_groups):
                group_was_prefill = idx < scheduler_outputs.num_prefill_groups
                seq_group = scheduled_seq_group.seq_group

                # NOTE: a seq_group that completed all of its prefill tokens
                # in the last iteration will have seq_group.is_prefill() = False
                # with group_was_prefill = True
                if group_was_prefill:
                    # Number of prompt tokens.
                    num_prompt_tokens_iter += (
                        scheduled_seq_group.token_chunk_size)

                    # If the seq_group just finished the prefill state
                    # get TTFT.
                    if not seq_group.is_prefill():
                        latency = seq_group.get_last_latency(now)
                        time_to_first_tokens_iter.append(latency)

                        # One generation token per finished prefill.
                        num_generation_tokens_from_prefill_groups += (
                            seq_group.num_seqs())
                else:
                    # TPOTs.
                    latency = seq_group.get_last_latency(now)
                    time_per_output_tokens_iter.append(latency)

                # Because of chunked prefill, we can have a single sequence
                # group that does multiple prompt_runs. To prevent logging
                # the same metadata more than once per request, we standardize
                # on logging request level information for finished requests,
                # which can only happen once.
                if seq_group.is_finished():
                    # Latency timings
                    time_e2e_requests.append(now -
                                             seq_group.metrics.arrival_time)

                    # Metadata
                    num_prompt_tokens_requests.append(
                        len(seq_group.prompt_token_ids))
                    num_generation_tokens_requests.extend([
                        seq.get_output_len()
                        for seq in seq_group.get_finished_seqs()
                    ])
                    best_of_requests.append(seq_group.sampling_params.best_of)
                    n_requests.append(seq_group.sampling_params.n)
                    finished_reason_requests.extend([
                        SequenceStatus.get_finished_reason(seq.status)
                        for seq in seq_group.get_finished_seqs()
                    ])

            # Number of generation tokens.
            #   num_batched_tokens equals the number of prompt_tokens plus the
            #   number of decode_tokens in a single iteration. So,
            #   num_generation_tokens = num_batched_tokens - num_prompt_tokens
            #   + num_generation_tokens_from_prefill_groups (since we generate
            #   one token on prefills on iters where the prefill finishes).
            num_generation_tokens_iter = (
                scheduler_outputs.num_batched_tokens - num_prompt_tokens_iter +
                num_generation_tokens_from_prefill_groups)

        # Spec decode, if enabled, emits specialized metrics from the worker in
        # sampler output.
        if model_output and (model_output[0].spec_decode_worker_metrics
                             is not None):
            spec_decode_metrics = model_output[0].spec_decode_worker_metrics
        else:
            spec_decode_metrics = None

        return Stats(
            now=now,

            # System stats
            #   Scheduler State
            num_running_sys=num_running_sys,
            num_swapped_sys=num_swapped_sys,
            num_waiting_sys=num_waiting_sys,
            #   KV Cache Usage in %
            gpu_cache_usage_sys=gpu_cache_usage_sys,
            cpu_cache_usage_sys=cpu_cache_usage_sys,

            # Iteration stats
            num_prompt_tokens_iter=num_prompt_tokens_iter,
            num_generation_tokens_iter=num_generation_tokens_iter,
            time_to_first_tokens_iter=time_to_first_tokens_iter,
            time_per_output_tokens_iter=time_per_output_tokens_iter,
            spec_decode_metrics=spec_decode_metrics,

            # Request stats
            #   Latency
            time_e2e_requests=time_e2e_requests,
            #   Metadata
            num_prompt_tokens_requests=num_prompt_tokens_requests,
            num_generation_tokens_requests=num_generation_tokens_requests,
            best_of_requests=best_of_requests,
            n_requests=n_requests,
            finished_reason_requests=finished_reason_requests,
        )

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.model_executor.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.model_executor.remove_lora(lora_id)

    def list_loras(self) -> List[int]:
        return self.model_executor.list_loras()

    def check_health(self) -> None:
        self.model_executor.check_health()
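
Read together with the docstring examples above, the pieces combine into a short offline loop: build an EngineArgs, create the engine with LLMEngine.from_engine_args, queue work with add_request, and drive step() until has_unfinished_requests() returns False. The sketch below is a non-authoritative illustration assembled from those docstring examples only; the model identifier is a placeholder, and it assumes the top-level re-exports (EngineArgs, LLMEngine, SamplingParams) provided by vllm/__init__.py in this wheel.

# Minimal usage sketch, not shipped with this wheel. Assumes a working runtime
# for this build; the model id below is a placeholder.
from vllm import EngineArgs, LLMEngine, SamplingParams

engine_args = EngineArgs(model="facebook/opt-125m")  # placeholder model id
engine = LLMEngine.from_engine_args(engine_args)

# Queue a single request, then step the engine until it drains.
engine.add_request(request_id="0",
                   prompt="What is LLM?",
                   sampling_params=SamplingParams(temperature=0.0))

while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)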