vllm-npu 0.4.2__py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/engine/metrics.py
ADDED
@@ -0,0 +1,368 @@
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Counter as CollectionsCounter
from typing import Dict, List, Optional, Protocol, Union

import numpy as np
from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
                               disable_created_metrics)

from vllm.logger import init_logger

if TYPE_CHECKING:
    from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics

logger = init_logger(__name__)

disable_created_metrics()

# The begin-* and end-* markers here are used by the documentation generator
# to extract the metrics definitions.


# begin-metrics-definitions
class Metrics:
    labelname_finish_reason = "finished_reason"

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Unregister any existing vLLM collectors
        for collector in list(REGISTRY._collector_to_names):
            if hasattr(collector, "_name") and "vllm" in collector._name:
                REGISTRY.unregister(collector)

        # Config Information
        self.info_cache_config = Info(
            name='vllm:cache_config',
            documentation='information of cache_config')

        # System stats
        #   Scheduler State
        self.gauge_scheduler_running = Gauge(
            name="vllm:num_requests_running",
            documentation="Number of requests currently running on GPU.",
            labelnames=labelnames)
        self.gauge_scheduler_waiting = Gauge(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames)
        self.gauge_scheduler_swapped = Gauge(
            name="vllm:num_requests_swapped",
            documentation="Number of requests swapped to CPU.",
            labelnames=labelnames)
        #   KV Cache Usage in %
        self.gauge_gpu_cache_usage = Gauge(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
        self.gauge_cpu_cache_usage = Gauge(
            name="vllm:cpu_cache_usage_perc",
            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)

        # Iteration stats
        self.counter_prompt_tokens = Counter(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames)
        self.counter_generation_tokens = Counter(
            name="vllm:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames)
        self.histogram_time_to_first_token = Histogram(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
            ])
        self.histogram_time_per_output_token = Histogram(
            name="vllm:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                1.0, 2.5
            ])

        # Request stats
        #   Latency
        self.histogram_e2e_time_request = Histogram(
            name="vllm:e2e_request_latency_seconds",
            documentation="Histogram of end to end request latency in seconds.",
            labelnames=labelnames,
            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
        #   Metadata
        self.histogram_num_prompt_tokens_request = Histogram(
            name="vllm:request_prompt_tokens",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_num_generation_tokens_request = Histogram(
            name="vllm:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_best_of_request = Histogram(
            name="vllm:request_params_best_of",
            documentation="Histogram of the best_of request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        self.histogram_n_request = Histogram(
            name="vllm:request_params_n",
            documentation="Histogram of the n request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        self.counter_request_success = Counter(
            name="vllm:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + [Metrics.labelname_finish_reason])

        # Deprecated in favor of vllm:prompt_tokens_total
        self.gauge_avg_prompt_throughput = Gauge(
            name="vllm:avg_prompt_throughput_toks_per_s",
            documentation="Average prefill throughput in tokens/s.",
            labelnames=labelnames,
        )
        # Deprecated in favor of vllm:generation_tokens_total
        self.gauge_avg_generation_throughput = Gauge(
            name="vllm:avg_generation_throughput_toks_per_s",
            documentation="Average generation throughput in tokens/s.",
            labelnames=labelnames,
        )


# end-metrics-definitions


def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values (1, 2, 5) until the value exceeds the specified maximum.

    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    mantissa_lst = [1, 2, 5]
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


@dataclass
class Stats:
    """Created by LLMEngine for use by StatLogger."""
    now: float

    # System stats (should have _sys suffix)
    #   Scheduler State
    num_running_sys: int
    num_waiting_sys: int
    num_swapped_sys: int
    #   KV Cache Usage in %
    gpu_cache_usage_sys: float
    cpu_cache_usage_sys: float

    # Iteration stats (should have _iter suffix)
    num_prompt_tokens_iter: int
    num_generation_tokens_iter: int
    time_to_first_tokens_iter: List[float]
    time_per_output_tokens_iter: List[float]

    # Request stats (should have _requests suffix)
    #   Latency
    time_e2e_requests: List[float]
    #   Metadata
    num_prompt_tokens_requests: List[int]
    num_generation_tokens_requests: List[int]
    best_of_requests: List[int]
    n_requests: List[int]
    finished_reason_requests: List[str]

    spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None


class SupportsMetricsInfo(Protocol):

    def metrics_info(self) -> Dict[str, str]:
        ...


class StatLogger:
    """StatLogger is used by LLMEngine to log to Prometheus and stdout."""

    def __init__(self, local_interval: float, labels: Dict[str, str],
                 max_model_len: int) -> None:
        # Metadata for logging locally.
        self.last_local_log = time.time()
        self.local_interval = local_interval

        # Tracked stats over current local logging interval.
        self.num_prompt_tokens: List[int] = []
        self.num_generation_tokens: List[int] = []

        # Prometheus metrics
        self.labels = labels
        self.metrics = Metrics(labelnames=list(labels.keys()),
                               max_model_len=max_model_len)

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        if type == "cache_config":
            self.metrics.info_cache_config.info(obj.metrics_info())

    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
        return float(np.sum(tracked_stats) / (now - self.last_local_log))

    def _local_interval_elapsed(self, now: float) -> bool:
        elapsed_time = now - self.last_local_log
        return elapsed_time > self.local_interval

    def _log_prometheus(self, stats: Stats) -> None:
        # System state data
        self._log_gauge(self.metrics.gauge_scheduler_running,
                        stats.num_running_sys)
        self._log_gauge(self.metrics.gauge_scheduler_swapped,
                        stats.num_swapped_sys)
        self._log_gauge(self.metrics.gauge_scheduler_waiting,
                        stats.num_waiting_sys)
        self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                        stats.gpu_cache_usage_sys)
        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
                        stats.cpu_cache_usage_sys)

        # Iteration level data
        self._log_counter(self.metrics.counter_prompt_tokens,
                          stats.num_prompt_tokens_iter)
        self._log_counter(self.metrics.counter_generation_tokens,
                          stats.num_generation_tokens_iter)
        self._log_histogram(self.metrics.histogram_time_to_first_token,
                            stats.time_to_first_tokens_iter)
        self._log_histogram(self.metrics.histogram_time_per_output_token,
                            stats.time_per_output_tokens_iter)

        # Request level data
        #   Latency
        self._log_histogram(self.metrics.histogram_e2e_time_request,
                            stats.time_e2e_requests)
        #   Metadata
        finished_reason_counter = CollectionsCounter(
            stats.finished_reason_requests)
        self._log_counter_labels(self.metrics.counter_request_success,
                                 finished_reason_counter,
                                 Metrics.labelname_finish_reason)
        self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
                            stats.num_prompt_tokens_requests)
        self._log_histogram(
            self.metrics.histogram_num_generation_tokens_request,
            stats.num_generation_tokens_requests)
        self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
        self._log_histogram(self.metrics.histogram_best_of_request,
                            stats.best_of_requests)

    def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None:
        # Convenience function for logging to gauge.
        gauge.labels(**self.labels).set(data)

    def _log_counter(self, counter: Counter, data: Union[int, float]) -> None:
        # Convenience function for logging to counter.
        counter.labels(**self.labels).inc(data)

    def _log_counter_labels(self, counter: Counter, data: CollectionsCounter,
                            label_key: str) -> None:
        # Convenience function for logging a collections.Counter of labels.
        for label, count in data.items():
            counter.labels(**{**self.labels, label_key: label}).inc(count)

    def _log_histogram(self, histogram: Histogram,
                       data: Union[List[int], List[float]]) -> None:
        # Convenience function for logging a list to a histogram.
        for datum in data:
            histogram.labels(**self.labels).observe(datum)

    def _log_prometheus_interval(self, prompt_throughput: float,
                                 generation_throughput: float) -> None:
        # Logs metrics to prometheus that are computed every logging_interval.
        # Support legacy gauge metrics that make throughput calculations on
        # the vLLM side. Moving forward, we should use counters like
        # counter_prompt_tokens and counter_generation_tokens, which log raw
        # data and calculate summaries using rate() on the Grafana/Prometheus
        # side. See
        # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
        self.metrics.gauge_avg_prompt_throughput.labels(
            **self.labels).set(prompt_throughput)
        self.metrics.gauge_avg_generation_throughput.labels(
            **self.labels).set(generation_throughput)

    def log(self, stats: Stats) -> None:
        """Called by LLMEngine.
        Logs to Prometheus and tracked stats every iteration.
        Logs to stdout every self.local_interval seconds."""

        # Log to prometheus.
        self._log_prometheus(stats)

        # Save tracked stats for token counters.
        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
        self.num_generation_tokens.append(stats.num_generation_tokens_iter)

        # Log locally every local_interval seconds.
        if self._local_interval_elapsed(stats.now):
            # Compute summary metrics for tracked stats (and log them
            # to prometheus if applicable).
            prompt_throughput = self._get_throughput(self.num_prompt_tokens,
                                                     now=stats.now)
            generation_throughput = self._get_throughput(
                self.num_generation_tokens, now=stats.now)
            self._log_prometheus_interval(
                prompt_throughput=prompt_throughput,
                generation_throughput=generation_throughput)

            # Log to stdout.
            logger.info(
                "Avg prompt throughput: %.1f tokens/s, "
                "Avg generation throughput: %.1f tokens/s, "
                "Running: %d reqs, Swapped: %d reqs, "
                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
                "CPU KV cache usage: %.1f%%",
                prompt_throughput,
                generation_throughput,
                stats.num_running_sys,
                stats.num_swapped_sys,
                stats.num_waiting_sys,
                stats.gpu_cache_usage_sys * 100,
                stats.cpu_cache_usage_sys * 100,
            )

            # Reset tracked stats for next interval.
            self.num_prompt_tokens = []
            self.num_generation_tokens = []
            self.last_local_log = stats.now

            if stats.spec_decode_metrics is not None:
                logger.info(
                    self._format_spec_decode_metrics_str(
                        stats.spec_decode_metrics))

    def _format_spec_decode_metrics_str(
            self, metrics: "SpecDecodeWorkerMetrics") -> str:

        return ("Speculative metrics: "
                f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
                f"System efficiency: {metrics.system_efficiency:.3f}, "
                f"Number of speculative tokens: {metrics.num_spec_tokens}, "
                f"Number of accepted tokens: {metrics.accepted_tokens}, "
                f"Number of draft tokens: {metrics.draft_tokens}, "
                f"Number of emitted tokens: {metrics.emitted_tokens}.")
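For orientation, here is a minimal driving sketch that is not part of the wheel: it assumes this package is installed and importable, exposes the default Prometheus registry that Metrics populates, and feeds StatLogger one fabricated Stats snapshot of the kind LLMEngine builds each iteration.

# Hypothetical StatLogger driver; all Stats values below are illustrative
# placeholders, not real engine output.
import time

from prometheus_client import start_http_server

from vllm.engine.metrics import StatLogger, Stats

# Serve the default Prometheus registry (which Metrics uses) on port 8000.
start_http_server(8000)

stat_logger = StatLogger(local_interval=5.0,
                         labels={"model_name": "example-model"},
                         max_model_len=4096)

# One fabricated per-iteration snapshot; LLMEngine constructs these itself.
stats = Stats(
    now=time.time(),
    num_running_sys=1,
    num_waiting_sys=0,
    num_swapped_sys=0,
    gpu_cache_usage_sys=0.25,
    cpu_cache_usage_sys=0.0,
    num_prompt_tokens_iter=128,
    num_generation_tokens_iter=16,
    time_to_first_tokens_iter=[0.05],
    time_per_output_tokens_iter=[0.02] * 16,
    time_e2e_requests=[0.4],
    num_prompt_tokens_requests=[128],
    num_generation_tokens_requests=[16],
    best_of_requests=[1],
    n_requests=[1],
    finished_reason_requests=["stop"],
)

# Updates the vllm:* Prometheus series immediately; the stdout throughput
# summary only appears once local_interval seconds have elapsed.
stat_logger.log(stats)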
vllm/engine/output_processor/__init__.py
File without changes

vllm/engine/output_processor/interfaces.py
ADDED
@@ -0,0 +1,76 @@
from abc import ABC, abstractmethod
from typing import Callable, List

from transformers import PreTrainedTokenizer

from vllm.config import SchedulerConfig
from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter


class SequenceGroupOutputProcessor(ABC):
    """Interface for logic that processes new token ids in sequence groups,
    managing detokenization, stop checking, and freeing/forking sequences with
    the scheduler.

    This is highly coupled with the LLMEngine and should be seen as an
    extension of it. The logic is separated to simplify the LLMEngine class
    and allow separate implementations for single-step decoding (which
    supports beam search sequence forking) and multi-step decoding (which does
    not support beam search, but does support speculative decoding).
    """

    @staticmethod
    def create_output_processor(
        scheduler_config: SchedulerConfig,
        detokenizer: Detokenizer,
        scheduler: Scheduler,
        seq_counter: Counter,
        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
        stop_checker: "StopChecker",
    ):
        """Create an output processor.

        This returns a single-step output processor if num_lookahead_slots is
        zero, else returns a multi-step output processor.
        """
        if scheduler_config.num_lookahead_slots == 0:
            # Importing here to avoid cycle.
            from vllm.engine.output_processor.single_step import (
                SingleStepOutputProcessor)
            return SingleStepOutputProcessor(
                scheduler_config,
                detokenizer,
                scheduler,
                seq_counter,
                stop_checker,
            )
        else:
            # Importing here to avoid cycle.
            from vllm.engine.output_processor.multi_step import (
                MultiStepOutputProcessor)
            return MultiStepOutputProcessor(
                detokenizer,
                scheduler,
                seq_counter,
                get_tokenizer_for_seq,
                stop_checker,
            )

    @abstractmethod
    def process_outputs(self, sequence_group: SequenceGroup,
                        outputs: List[SequenceGroupOutput]) -> None:
        """Process new token ids for the sequence group. Handles logic such as
        detokenization, stop checking, and freeing/forking sequences in the
        scheduler.
        """
        pass

    @abstractmethod
    def process_prompt_logprob(self, seq_group: SequenceGroup,
                               outputs: List[SequenceGroupOutput]) -> None:
        """Update prompt logprobs received from outputs to seq_group."""
        pass
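To make the contract concrete, here is a hypothetical no-op subclass that is not in the wheel; it only shows the two methods every implementation must provide, while a real processor would do detokenization, stop checking, and scheduler bookkeeping in these bodies.

# Hypothetical minimal implementation of the interface, for illustration only.
from typing import List

from vllm.engine.output_processor.interfaces import (
    SequenceGroupOutputProcessor)
from vllm.sequence import SequenceGroup, SequenceGroupOutput


class NoOpOutputProcessor(SequenceGroupOutputProcessor):
    """Accepts sampler outputs without acting on them."""

    def process_outputs(self, sequence_group: SequenceGroup,
                        outputs: List[SequenceGroupOutput]) -> None:
        # A real implementation appends the sampled tokens to the sequences,
        # detokenizes them, and checks stop conditions here.
        pass

    def process_prompt_logprob(self, seq_group: SequenceGroup,
                               outputs: List[SequenceGroupOutput]) -> None:
        # A real implementation attaches prompt logprobs to seq_group here.
        pass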
vllm/engine/output_processor/multi_step.py
ADDED
@@ -0,0 +1,142 @@
import functools
from typing import Callable, List

from transformers import PreTrainedTokenizer

from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.interfaces import (
    SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
                           SequenceOutput, SequenceStatus)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter

logger = init_logger(__name__)


class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
    """SequenceGroupOutputProcessor which handles logic related to
    detokenization and stopping conditions. It specializes to "multi-step
    decoding", where vLLM's worker may generate multiple tokens per invocation.
    This is currently mutually exclusive with advanced sampling techniques like
    beam search, which motivates the separation of this logic from the single
    step output processor.

    This class is responsible for things such as correctly appending all new
    token ids to their sequence, detokenizing new token ids, truncating new
    output tokens after an eos token, and correctly handling the case where the
    number of new output tokens per sequence differs in a single batch.
    """

    def __init__(
        self,
        detokenizer: Detokenizer,
        scheduler: Scheduler,
        seq_counter: Counter,
        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
        stop_checker: StopChecker,
    ):
        self.detokenizer = detokenizer
        self.scheduler = scheduler
        self.seq_counter = seq_counter
        self.get_tokenizer_for_seq = get_tokenizer_for_seq
        self.stop_checker = stop_checker

    def process_prompt_logprob(self, seq_group: SequenceGroup,
                               outputs: List[SequenceGroupOutput]) -> None:
        # TODO(sang): Prompt logprob currently not implemented in multi step
        # workers.
        self._log_prompt_logprob_unsupported_warning_once()

    @staticmethod
    @functools.lru_cache()
    def _log_prompt_logprob_unsupported_warning_once():
        logger.warning(
            "Prompt logprob is not supported by multi step workers. "
            "(e.g., speculative decode uses multi step workers).")

    def process_outputs(self, sequence_group: SequenceGroup,
                        outputs: List[SequenceGroupOutput]) -> None:
        """Append new tokens in the outputs to sequences in the sequence group.

        This only supports sequence groups of size 1. It supports greater than
        one new token per sequence.

        This applies logic like stop condition checking and detokenization,
        including freeing finished sequences. It also handles cases where there
        are tokens emitted after the EOS token.
        """
        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)

        assert seqs, "expected running sequences"
        assert len(seqs) == 1, (
            "Beam search not supported in multi-step decoding.")
        seq = seqs[0]

        # Since there's only one sequence per sequence group, we can take the
        # first sample.
        samples = [outputs[step].samples[0] for step in range(len(outputs))]

        # -1 means the output token is not valid (eg. due to spec decode
        # rejecting tokens).
        valid_samples = [
            sample for sample in samples if sample.output_token != -1
        ]
        assert valid_samples

        self._process_seq_outputs(seq, valid_samples,
                                  sequence_group.sampling_params)

    def _process_seq_outputs(self, seq: Sequence,
                             valid_samples: List[SequenceOutput],
                             sampling_params: SamplingParams) -> None:
        output_token_ids = [sample.output_token for sample in valid_samples]
        output_logprobs = [sample.logprobs for sample in valid_samples]

        # Truncate to max_tokens if necessary.
        remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() +
                                                         len(output_token_ids))
        if remaining_tokens < 0:
            valid_samples = valid_samples[:remaining_tokens]
            output_token_ids = output_token_ids[:remaining_tokens]

        # Truncate any tokens after EOS. This is required as spec decode
        # generates a fixed number of tokens without evaluating stopping
        # conditions within the block. This can cause an eos token to be
        # unintentionally ignored.
        if not sampling_params.ignore_eos:
            eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id
            # Avoiding .index calls as exception throwing in the happy path
            # is expensive.
            for i in range(len(output_token_ids)):
                if output_token_ids[i] == eos_token_id:
                    output_token_ids = output_token_ids[:i + 1]
                    valid_samples = valid_samples[:i + 1]
                    break

        # Incrementally append tokens to the sequence, as if we had only one
        # new token.
        for output_token_id, output_logprob in zip(output_token_ids,
                                                   output_logprobs):
            seq.append_token_id(
                token_id=output_token_id,
                logprobs=output_logprob,
            )

            new_char_count = 0
            if sampling_params.detokenize:
                new_char_count = self.detokenizer.decode_sequence_inplace(
                    seq, sampling_params)

            self.stop_checker.maybe_stop_sequence(
                seq,
                new_char_count=new_char_count,
                sampling_params=sampling_params)
            if seq.is_finished():
                break

        if seq.is_finished():
            self.scheduler.free_seq(seq)
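The two truncation passes in _process_seq_outputs are the subtle part: multi-step and speculative workers emit a fixed block of tokens, so the processor must clip both at the request's max_tokens budget and at the first EOS after the fact. Below is a standalone sketch of just that clipping, with plain integers standing in for sampled tokens; the function name and values are illustrative and not part of the wheel.

# Illustrative re-statement of the clipping logic, for clarity only.
from typing import List


def clip_new_tokens(new_token_ids: List[int], current_output_len: int,
                    max_tokens: int, eos_token_id: int,
                    ignore_eos: bool = False) -> List[int]:
    """Mirror of the truncation performed by MultiStepOutputProcessor."""
    # Clip to the request's max_tokens budget.
    remaining = max_tokens - (current_output_len + len(new_token_ids))
    if remaining < 0:
        new_token_ids = new_token_ids[:remaining]

    # Keep everything up to and including the first EOS token.
    if not ignore_eos:
        for i, token_id in enumerate(new_token_ids):
            if token_id == eos_token_id:
                new_token_ids = new_token_ids[:i + 1]
                break
    return new_token_ids


# 5 speculative tokens proposed, but only 3 fit in the budget and the second
# surviving token is EOS, so just [7, 2] is kept.
assert clip_new_tokens([7, 2, 9, 9, 9], current_output_len=13,
                       max_tokens=16, eos_token_id=2) == [7, 2]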