vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/engine/metrics.py ADDED
@@ -0,0 +1,368 @@
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from typing import Counter as CollectionsCounter
+from typing import Dict, List, Optional, Protocol, Union
+
+import numpy as np
+from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
+                               disable_created_metrics)
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
+
+logger = init_logger(__name__)
+
+disable_created_metrics()
+
+# The begin-* and end-* markers here are used by the documentation generator
+# to extract the metrics definitions.
+
+
+# begin-metrics-definitions
+class Metrics:
+    labelname_finish_reason = "finished_reason"
+
+    def __init__(self, labelnames: List[str], max_model_len: int):
+        # Unregister any existing vLLM collectors
+        for collector in list(REGISTRY._collector_to_names):
+            if hasattr(collector, "_name") and "vllm" in collector._name:
+                REGISTRY.unregister(collector)
+
+        # Config Information
+        self.info_cache_config = Info(
+            name='vllm:cache_config',
+            documentation='information of cache_config')
+
+        # System stats
+        # Scheduler State
+        self.gauge_scheduler_running = Gauge(
+            name="vllm:num_requests_running",
+            documentation="Number of requests currently running on GPU.",
+            labelnames=labelnames)
+        self.gauge_scheduler_waiting = Gauge(
+            name="vllm:num_requests_waiting",
+            documentation="Number of requests waiting to be processed.",
+            labelnames=labelnames)
+        self.gauge_scheduler_swapped = Gauge(
+            name="vllm:num_requests_swapped",
+            documentation="Number of requests swapped to CPU.",
+            labelnames=labelnames)
+        # KV Cache Usage in %
+        self.gauge_gpu_cache_usage = Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+        self.gauge_cpu_cache_usage = Gauge(
+            name="vllm:cpu_cache_usage_perc",
+            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+
+        # Iteration stats
+        self.counter_prompt_tokens = Counter(
+            name="vllm:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames)
+        self.counter_generation_tokens = Counter(
+            name="vllm:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames)
+        self.histogram_time_to_first_token = Histogram(
+            name="vllm:time_to_first_token_seconds",
+            documentation="Histogram of time to first token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+            ])
+        self.histogram_time_per_output_token = Histogram(
+            name="vllm:time_per_output_token_seconds",
+            documentation="Histogram of time per output token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5
+            ])
+
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request = Histogram(
+            name="vllm:e2e_request_latency_seconds",
+            documentation="Histogram of end to end request latency in seconds.",
+            labelnames=labelnames,
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+        # Metadata
+        self.histogram_num_prompt_tokens_request = Histogram(
+            name="vllm:request_prompt_tokens",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
+        self.histogram_num_generation_tokens_request = Histogram(
+            name="vllm:request_generation_tokens",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
+        self.histogram_best_of_request = Histogram(
+            name="vllm:request_params_best_of",
+            documentation="Histogram of the best_of request parameter.",
+            labelnames=labelnames,
+            buckets=[1, 2, 5, 10, 20],
+        )
+        self.histogram_n_request = Histogram(
+            name="vllm:request_params_n",
+            documentation="Histogram of the n request parameter.",
+            labelnames=labelnames,
+            buckets=[1, 2, 5, 10, 20],
+        )
+        self.counter_request_success = Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + [Metrics.labelname_finish_reason])
+
+        # Deprecated in favor of vllm:prompt_tokens_total
+        self.gauge_avg_prompt_throughput = Gauge(
+            name="vllm:avg_prompt_throughput_toks_per_s",
+            documentation="Average prefill throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+        # Deprecated in favor of vllm:generation_tokens_total
+        self.gauge_avg_generation_throughput = Gauge(
+            name="vllm:avg_generation_throughput_toks_per_s",
+            documentation="Average generation throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+
+
+# end-metrics-definitions
+
+
+def build_1_2_5_buckets(max_value: int):
+    """
+    Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
+
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    mantissa_lst = [1, 2, 5]
+    exponent = 0
+    buckets = []
+    while True:
+        for m in mantissa_lst:
+            value = m * 10**exponent
+            if value <= max_value:
+                buckets.append(value)
+            else:
+                return buckets
+        exponent += 1
+
+
+@dataclass
+class Stats:
+    """Created by LLMEngine for use by StatLogger."""
+    now: float
+
+    # System stats (should have _sys suffix)
+    # Scheduler State
+    num_running_sys: int
+    num_waiting_sys: int
+    num_swapped_sys: int
+    # KV Cache Usage in %
+    gpu_cache_usage_sys: float
+    cpu_cache_usage_sys: float
+
+    # Iteration stats (should have _iter suffix)
+    num_prompt_tokens_iter: int
+    num_generation_tokens_iter: int
+    time_to_first_tokens_iter: List[float]
+    time_per_output_tokens_iter: List[float]
+
+    # Request stats (should have _requests suffix)
+    # Latency
+    time_e2e_requests: List[float]
+    # Metadata
+    num_prompt_tokens_requests: List[int]
+    num_generation_tokens_requests: List[int]
+    best_of_requests: List[int]
+    n_requests: List[int]
+    finished_reason_requests: List[str]
+
+    spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+
+
+class SupportsMetricsInfo(Protocol):
+
+    def metrics_info(self) -> Dict[str, str]:
+        ...
+
+
+class StatLogger:
+    """StatLogger is used by LLMEngine to log to Prometheus and stdout."""
+
+    def __init__(self, local_interval: float, labels: Dict[str, str],
+                 max_model_len: int) -> None:
+        # Metadata for logging locally.
+        self.last_local_log = time.time()
+        self.local_interval = local_interval
+
+        # Tracked stats over current local logging interval.
+        self.num_prompt_tokens: List[int] = []
+        self.num_generation_tokens: List[int] = []
+
+        # Prometheus metrics
+        self.labels = labels
+        self.metrics = Metrics(labelnames=list(labels.keys()),
+                               max_model_len=max_model_len)
+
+    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
+        if type == "cache_config":
+            self.metrics.info_cache_config.info(obj.metrics_info())
+
+    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
+        return float(np.sum(tracked_stats) / (now - self.last_local_log))
+
+    def _local_interval_elapsed(self, now: float) -> bool:
+        elapsed_time = now - self.last_local_log
+        return elapsed_time > self.local_interval
+
+    def _log_prometheus(self, stats: Stats) -> None:
+        # System state data
+        self._log_gauge(self.metrics.gauge_scheduler_running,
+                        stats.num_running_sys)
+        self._log_gauge(self.metrics.gauge_scheduler_swapped,
+                        stats.num_swapped_sys)
+        self._log_gauge(self.metrics.gauge_scheduler_waiting,
+                        stats.num_waiting_sys)
+        self._log_gauge(self.metrics.gauge_gpu_cache_usage,
+                        stats.gpu_cache_usage_sys)
+        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
+                        stats.cpu_cache_usage_sys)
+
+        # Iteration level data
+        self._log_counter(self.metrics.counter_prompt_tokens,
+                          stats.num_prompt_tokens_iter)
+        self._log_counter(self.metrics.counter_generation_tokens,
+                          stats.num_generation_tokens_iter)
+        self._log_histogram(self.metrics.histogram_time_to_first_token,
+                            stats.time_to_first_tokens_iter)
+        self._log_histogram(self.metrics.histogram_time_per_output_token,
+                            stats.time_per_output_tokens_iter)
+
+        # Request level data
+        # Latency
+        self._log_histogram(self.metrics.histogram_e2e_time_request,
+                            stats.time_e2e_requests)
+        # Metadata
+        finished_reason_counter = CollectionsCounter(
+            stats.finished_reason_requests)
+        self._log_counter_labels(self.metrics.counter_request_success,
+                                 finished_reason_counter,
+                                 Metrics.labelname_finish_reason)
+        self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
+                            stats.num_prompt_tokens_requests)
+        self._log_histogram(
+            self.metrics.histogram_num_generation_tokens_request,
+            stats.num_generation_tokens_requests)
+        self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
+        self._log_histogram(self.metrics.histogram_best_of_request,
+                            stats.best_of_requests)
+
+    def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None:
+        # Convenience function for logging to gauge.
+        gauge.labels(**self.labels).set(data)
+
+    def _log_counter(self, counter: Counter, data: Union[int, float]) -> None:
+        # Convenience function for logging to counter.
+        counter.labels(**self.labels).inc(data)
+
+    def _log_counter_labels(self, counter: Counter, data: CollectionsCounter,
+                            label_key: str) -> None:
+        # Convenience function for collection counter of labels.
+        for label, count in data.items():
+            counter.labels(**{**self.labels, label_key: label}).inc(count)
+
+    def _log_histogram(self, histogram: Histogram,
+                       data: Union[List[int], List[float]]) -> None:
+        # Convenience function for logging list to histogram.
+        for datum in data:
+            histogram.labels(**self.labels).observe(datum)
+
+    def _log_prometheus_interval(self, prompt_throughput: float,
+                                 generation_throughput: float) -> None:
+        # Logs metrics to prometheus that are computed every logging_interval.
+        # Support legacy gauge metrics that make throughput calculations on
+        # the vLLM side. Moving forward, we should use counters like
+        # counter_prompt_tokens and counter_generation_tokens, which log raw
+        # data and calculate summaries using rate() on the grafana/prometheus
+        # side. See
+        # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
+        self.metrics.gauge_avg_prompt_throughput.labels(
+            **self.labels).set(prompt_throughput)
+        self.metrics.gauge_avg_generation_throughput.labels(
+            **self.labels).set(generation_throughput)
+
+    def log(self, stats: Stats) -> None:
+        """Called by LLMEngine.
+        Logs to prometheus and tracked stats every iteration.
+        Logs to stdout every self.local_interval seconds."""
+
+        # Log to prometheus.
+        self._log_prometheus(stats)
+
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
+        self.num_generation_tokens.append(stats.num_generation_tokens_iter)
+
+        # Log locally every local_interval seconds.
+        if self._local_interval_elapsed(stats.now):
+            # Compute summary metrics for tracked stats (and log them
+            # to prometheus if applicable).
+            prompt_throughput = self._get_throughput(self.num_prompt_tokens,
+                                                     now=stats.now)
+            generation_throughput = self._get_throughput(
+                self.num_generation_tokens, now=stats.now)
+            self._log_prometheus_interval(
+                prompt_throughput=prompt_throughput,
+                generation_throughput=generation_throughput)
+
+            # Log to stdout.
+            logger.info(
+                "Avg prompt throughput: %.1f tokens/s, "
+                "Avg generation throughput: %.1f tokens/s, "
+                "Running: %d reqs, Swapped: %d reqs, "
+                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
+                "CPU KV cache usage: %.1f%%",
+                prompt_throughput,
+                generation_throughput,
+                stats.num_running_sys,
+                stats.num_swapped_sys,
+                stats.num_waiting_sys,
+                stats.gpu_cache_usage_sys * 100,
+                stats.cpu_cache_usage_sys * 100,
+            )
+
+            # Reset tracked stats for next interval.
+            self.num_prompt_tokens = []
+            self.num_generation_tokens = []
+            self.last_local_log = stats.now
+
+            if stats.spec_decode_metrics is not None:
+                logger.info(
+                    self._format_spec_decode_metrics_str(
+                        stats.spec_decode_metrics))
+
+    def _format_spec_decode_metrics_str(
+            self, metrics: "SpecDecodeWorkerMetrics") -> str:
+
+        return ("Speculative metrics: "
+                f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
+                f"System efficiency: {metrics.system_efficiency:.3f}, "
+                f"Number of speculative tokens: {metrics.num_spec_tokens}, "
+                f"Number of accepted tokens: {metrics.accepted_tokens}, "
+                f"Number of draft tokens: {metrics.draft_tokens}, "
+                f"Number of emitted tokens: {metrics.emitted_tokens}.")
vllm/engine/output_processor/__init__.py ADDED (empty file, no lines shown)
vllm/engine/output_processor/interfaces.py ADDED
@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+from typing import Callable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.config import SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+
+class SequenceGroupOutputProcessor(ABC):
+    """Interface for logic that processes new token ids in sequence groups,
+    managing detokenization, stop checking, and freeing/forking sequences with
+    the scheduler.
+
+    This is highly coupled with the LLMEngine and should be seen as an extension
+    of it. The logic is separated to simplify the LLMEngine class and allow
+    separate implementations for single-step decoding (which supports beam
+    search sequence forking) and multi-step decoding (which does not support
+    beam search, but does support speculative decoding).
+    """
+
+    @staticmethod
+    def create_output_processor(
+        scheduler_config: SchedulerConfig,
+        detokenizer: Detokenizer,
+        scheduler: Scheduler,
+        seq_counter: Counter,
+        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+        stop_checker: "StopChecker",
+    ):
+        """Create an output processor.
+
+        This returns a single-step output processor if num_lookahead_slots is
+        zero, else returns a multi-step output processor.
+        """
+        if scheduler_config.num_lookahead_slots == 0:
+            # Importing here to avoid cycle.
+            from vllm.engine.output_processor.single_step import (
+                SingleStepOutputProcessor)
+            return SingleStepOutputProcessor(
+                scheduler_config,
+                detokenizer,
+                scheduler,
+                seq_counter,
+                stop_checker,
+            )
+        else:
+            # Importing here to avoid cycle.
+            from vllm.engine.output_processor.multi_step import (
+                MultiStepOutputProcessor)
+            return MultiStepOutputProcessor(
+                detokenizer,
+                scheduler,
+                seq_counter,
+                get_tokenizer_for_seq,
+                stop_checker,
+            )
+
+    @abstractmethod
+    def process_outputs(self, sequence_group: SequenceGroup,
+                        outputs: List[SequenceGroupOutput]) -> None:
+        """Process new token ids for the sequence group. Handles logic such as
+        detokenization, stop checking, and freeing/forking sequences in the
+        scheduler.
+        """
+        pass
+
+    @abstractmethod
+    def process_prompt_logprob(self, seq_group: SequenceGroup,
+                               outputs: List[SequenceGroupOutput]) -> None:
+        """Update prompt logprobs received from outputs to seq_group."""
+        pass
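
Note (illustration, not part of the diff): the factory above dispatches purely on scheduler_config.num_lookahead_slots. A rough sketch of that selection, using placeholder collaborators and assuming the concrete processors' constructors simply store what they are given:

from types import SimpleNamespace

from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor

# Placeholders for the collaborators LLMEngine normally supplies
# (Detokenizer, Scheduler, Counter, tokenizer getter, StopChecker).
stub = SimpleNamespace()

processor = SequenceGroupOutputProcessor.create_output_processor(
    scheduler_config=SimpleNamespace(num_lookahead_slots=0),  # 0 -> single-step
    detokenizer=stub,
    scheduler=stub,
    seq_counter=stub,
    get_tokenizer_for_seq=lambda seq: stub,
    stop_checker=stub,
)
# With num_lookahead_slots > 0 (e.g. speculative decoding), the same call
# returns a MultiStepOutputProcessor instead.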
vllm/engine/output_processor/multi_step.py ADDED
@@ -0,0 +1,142 @@
+import functools
+from typing import Callable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (
+    SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
+                           SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+logger = init_logger(__name__)
+
+
+class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
+    """SequenceGroupOutputProcessor which handles logic related to
+    detokenization and stopping conditions. It specializes to "multi-step
+    decoding", where vLLM's worker may generate multiple tokens per invocation.
+    This is currently mutually exclusive with advanced sampling techniques like
+    beam search, which motivates the separation of this logic from the single
+    step output processor.
+
+    This class is responsible for things such as correctly appending all new
+    token ids to their sequence, detokenizing new token ids, truncating new
+    output tokens after an eos token, and correctly handling the case where the
+    number of new output tokens per sequence differs in a single batch.
+    """
+
+    def __init__(
+        self,
+        detokenizer: Detokenizer,
+        scheduler: Scheduler,
+        seq_counter: Counter,
+        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+        stop_checker: StopChecker,
+    ):
+        self.detokenizer = detokenizer
+        self.scheduler = scheduler
+        self.seq_counter = seq_counter
+        self.get_tokenizer_for_seq = get_tokenizer_for_seq
+        self.stop_checker = stop_checker
+
+    def process_prompt_logprob(self, seq_group: SequenceGroup,
+                               outputs: List[SequenceGroupOutput]) -> None:
+        # TODO(sang): Prompt logprob currently not implemented in multi step
+        # workers.
+        self._log_prompt_logprob_unsupported_warning_once()
+
+    @staticmethod
+    @functools.lru_cache()
+    def _log_prompt_logprob_unsupported_warning_once():
+        logger.warning(
+            "Prompt logprob is not supported by multi step workers. "
+            "(e.g., speculative decode uses multi step workers).")
+
+    def process_outputs(self, sequence_group: SequenceGroup,
+                        outputs: List[SequenceGroupOutput]) -> None:
+        """Append new tokens in the outputs to sequences in the sequence group.
+
+        This only supports sequence groups of size 1. It supports greater than
+        one new token per sequence.
+
+        This applies logic like stop condition checking and detokenization,
+        including freeing finished sequences. It also handles cases where there
+        are tokens emitted after the EOS token.
+        """
+        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
+
+        assert seqs, "expected running sequences"
+        assert len(seqs) == 1, (
+            "Beam search not supported in multi-step decoding.")
+        seq = seqs[0]
+
+        # Since there's only one sequence per sequence group, we can take the
+        # first sample.
+        samples = [outputs[step].samples[0] for step in range(len(outputs))]
+
+        # -1 means the output token is not valid (eg. due to spec decode
+        # rejecting tokens).
+        valid_samples = [
+            sample for sample in samples if sample.output_token != -1
+        ]
+        assert valid_samples
+
+        self._process_seq_outputs(seq, valid_samples,
+                                  sequence_group.sampling_params)
+
+    def _process_seq_outputs(self, seq: Sequence,
+                             valid_samples: List[SequenceOutput],
+                             sampling_params: SamplingParams) -> None:
+        output_token_ids = [sample.output_token for sample in valid_samples]
+        output_logprobs = [sample.logprobs for sample in valid_samples]
+
+        # Truncate to max_tokens if necessary.
+        remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() +
+                                                         len(output_token_ids))
+        if remaining_tokens < 0:
+            valid_samples = valid_samples[:remaining_tokens]
+            output_token_ids = output_token_ids[:remaining_tokens]
+
+        # Truncate any tokens after EOS. This is required as spec decode
+        # generates a fixed number of tokens without evaluating stopping
+        # conditions within the block. This can cause an eos token to be
+        # unintentionally ignored.
+        if not sampling_params.ignore_eos:
+            eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id
+            # Avoiding .index calls as exception throwing in the happy path
+            # is expensive.
+            for i in range(len(output_token_ids)):
+                if output_token_ids[i] == eos_token_id:
+                    output_token_ids = output_token_ids[:i + 1]
+                    valid_samples = valid_samples[:i + 1]
+                    break
+
+        # Incrementally append tokens to the sequence, as if we had only one new
+        # token.
+        for output_token_id, output_logprob in zip(output_token_ids,
+                                                   output_logprobs):
+            seq.append_token_id(
+                token_id=output_token_id,
+                logprobs=output_logprob,
+            )
+
+            new_char_count = 0
+            if sampling_params.detokenize:
+                new_char_count = self.detokenizer.decode_sequence_inplace(
+                    seq, sampling_params)
+
+            self.stop_checker.maybe_stop_sequence(
+                seq,
+                new_char_count=new_char_count,
+                sampling_params=sampling_params)
+            if seq.is_finished():
+                break
+
+        if seq.is_finished():
+            self.scheduler.free_seq(seq)
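
Note (illustration, not part of the diff): the max_tokens truncation in _process_seq_outputs leans on Python's negative slicing: when remaining_tokens is negative, samples[:remaining_tokens] drops exactly that many trailing samples. A standalone sketch of the same two truncation rules on plain token-id lists (the real code avoids .index for speed; this version favors brevity):

from typing import List

def truncate_new_tokens(new_token_ids: List[int], already_generated: int,
                        max_tokens: int, eos_token_id: int,
                        ignore_eos: bool) -> List[int]:
    """Mirror of the truncation rules used by MultiStepOutputProcessor."""
    # 1) Enforce max_tokens: a negative remainder trims that many tokens
    #    from the end via negative slicing.
    remaining = max_tokens - (already_generated + len(new_token_ids))
    if remaining < 0:
        new_token_ids = new_token_ids[:remaining]
    # 2) Enforce EOS: keep tokens up to and including the first EOS.
    if not ignore_eos and eos_token_id in new_token_ids:
        new_token_ids = new_token_ids[:new_token_ids.index(eos_token_id) + 1]
    return new_token_ids

# 5 speculative tokens proposed, but only 3 slots left before max_tokens,
# and an EOS (id 2) appears second: the EOS rule wins here.
print(truncate_new_tokens([7, 2, 9, 4, 1], already_generated=13,
                          max_tokens=16, eos_token_id=2, ignore_eos=False))
# -> [7, 2]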