vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/engine/metrics.py ADDED
@@ -0,0 +1,368 @@
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from typing import Counter as CollectionsCounter
+from typing import Dict, List, Optional, Protocol, Union
+
+import numpy as np
+from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
+                               disable_created_metrics)
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
+
+logger = init_logger(__name__)
+
+disable_created_metrics()
+
+# The begin-* and end-* markers here are used by the documentation generator
+# to extract the metrics definitions.
+
+
+# begin-metrics-definitions
+class Metrics:
+    labelname_finish_reason = "finished_reason"
+
+    def __init__(self, labelnames: List[str], max_model_len: int):
+        # Unregister any existing vLLM collectors
+        for collector in list(REGISTRY._collector_to_names):
+            if hasattr(collector, "_name") and "vllm" in collector._name:
+                REGISTRY.unregister(collector)
+
+        # Config Information
+        self.info_cache_config = Info(
+            name='vllm:cache_config',
+            documentation='information of cache_config')
+
+        # System stats
+        # Scheduler State
+        self.gauge_scheduler_running = Gauge(
+            name="vllm:num_requests_running",
+            documentation="Number of requests currently running on GPU.",
+            labelnames=labelnames)
+        self.gauge_scheduler_waiting = Gauge(
+            name="vllm:num_requests_waiting",
+            documentation="Number of requests waiting to be processed.",
+            labelnames=labelnames)
+        self.gauge_scheduler_swapped = Gauge(
+            name="vllm:num_requests_swapped",
+            documentation="Number of requests swapped to CPU.",
+            labelnames=labelnames)
+        # KV Cache Usage in %
+        self.gauge_gpu_cache_usage = Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+        self.gauge_cpu_cache_usage = Gauge(
+            name="vllm:cpu_cache_usage_perc",
+            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+
+        # Iteration stats
+        self.counter_prompt_tokens = Counter(
+            name="vllm:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames)
+        self.counter_generation_tokens = Counter(
+            name="vllm:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames)
+        self.histogram_time_to_first_token = Histogram(
+            name="vllm:time_to_first_token_seconds",
+            documentation="Histogram of time to first token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+            ])
+        self.histogram_time_per_output_token = Histogram(
+            name="vllm:time_per_output_token_seconds",
+            documentation="Histogram of time per output token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5
+            ])
+
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request = Histogram(
+            name="vllm:e2e_request_latency_seconds",
+            documentation="Histogram of end to end request latency in seconds.",
+            labelnames=labelnames,
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+        # Metadata
+        self.histogram_num_prompt_tokens_request = Histogram(
+            name="vllm:request_prompt_tokens",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
+        self.histogram_num_generation_tokens_request = Histogram(
+            name="vllm:request_generation_tokens",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
+        self.histogram_best_of_request = Histogram(
+            name="vllm:request_params_best_of",
+            documentation="Histogram of the best_of request parameter.",
+            labelnames=labelnames,
+            buckets=[1, 2, 5, 10, 20],
+        )
+        self.histogram_n_request = Histogram(
+            name="vllm:request_params_n",
+            documentation="Histogram of the n request parameter.",
+            labelnames=labelnames,
+            buckets=[1, 2, 5, 10, 20],
+        )
+        self.counter_request_success = Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + [Metrics.labelname_finish_reason])
+
+        # Deprecated in favor of vllm:prompt_tokens_total
+        self.gauge_avg_prompt_throughput = Gauge(
+            name="vllm:avg_prompt_throughput_toks_per_s",
+            documentation="Average prefill throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+        # Deprecated in favor of vllm:generation_tokens_total
+        self.gauge_avg_generation_throughput = Gauge(
+            name="vllm:avg_generation_throughput_toks_per_s",
+            documentation="Average generation throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+
+
+# end-metrics-definitions
+
+
+def build_1_2_5_buckets(max_value: int):
+    """
+    Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
+
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    mantissa_lst = [1, 2, 5]
+    exponent = 0
+    buckets = []
+    while True:
+        for m in mantissa_lst:
+            value = m * 10**exponent
+            if value <= max_value:
+                buckets.append(value)
+            else:
+                return buckets
+        exponent += 1
+
+
+@dataclass
+class Stats:
+    """Created by LLMEngine for use by StatLogger."""
+    now: float
+
+    # System stats (should have _sys suffix)
+    # Scheduler State
+    num_running_sys: int
+    num_waiting_sys: int
+    num_swapped_sys: int
+    # KV Cache Usage in %
+    gpu_cache_usage_sys: float
+    cpu_cache_usage_sys: float
+
+    # Iteration stats (should have _iter suffix)
+    num_prompt_tokens_iter: int
+    num_generation_tokens_iter: int
+    time_to_first_tokens_iter: List[float]
+    time_per_output_tokens_iter: List[float]
+
+    # Request stats (should have _requests suffix)
+    # Latency
+    time_e2e_requests: List[float]
+    # Metadata
+    num_prompt_tokens_requests: List[int]
+    num_generation_tokens_requests: List[int]
+    best_of_requests: List[int]
+    n_requests: List[int]
+    finished_reason_requests: List[str]
+
+    spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+
+
+class SupportsMetricsInfo(Protocol):
+
+    def metrics_info(self) -> Dict[str, str]:
+        ...
+
+
+class StatLogger:
+    """StatLogger is used by LLMEngine to log to Prometheus and stdout."""
+
+    def __init__(self, local_interval: float, labels: Dict[str, str],
+                 max_model_len: int) -> None:
+        # Metadata for logging locally.
+        self.last_local_log = time.time()
+        self.local_interval = local_interval
+
+        # Tracked stats over current local logging interval.
+        self.num_prompt_tokens: List[int] = []
+        self.num_generation_tokens: List[int] = []
+
+        # Prometheus metrics
+        self.labels = labels
+        self.metrics = Metrics(labelnames=list(labels.keys()),
+                               max_model_len=max_model_len)
+
+    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
+        if type == "cache_config":
+            self.metrics.info_cache_config.info(obj.metrics_info())
+
+    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
+        return float(np.sum(tracked_stats) / (now - self.last_local_log))
+
+    def _local_interval_elapsed(self, now: float) -> bool:
+        elapsed_time = now - self.last_local_log
+        return elapsed_time > self.local_interval
+
+    def _log_prometheus(self, stats: Stats) -> None:
+        # System state data
+        self._log_gauge(self.metrics.gauge_scheduler_running,
+                        stats.num_running_sys)
+        self._log_gauge(self.metrics.gauge_scheduler_swapped,
+                        stats.num_swapped_sys)
+        self._log_gauge(self.metrics.gauge_scheduler_waiting,
+                        stats.num_waiting_sys)
+        self._log_gauge(self.metrics.gauge_gpu_cache_usage,
+                        stats.gpu_cache_usage_sys)
+        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
+                        stats.cpu_cache_usage_sys)
+
+        # Iteration level data
+        self._log_counter(self.metrics.counter_prompt_tokens,
+                          stats.num_prompt_tokens_iter)
+        self._log_counter(self.metrics.counter_generation_tokens,
+                          stats.num_generation_tokens_iter)
+        self._log_histogram(self.metrics.histogram_time_to_first_token,
+                            stats.time_to_first_tokens_iter)
+        self._log_histogram(self.metrics.histogram_time_per_output_token,
+                            stats.time_per_output_tokens_iter)
+
+        # Request level data
+        # Latency
+        self._log_histogram(self.metrics.histogram_e2e_time_request,
+                            stats.time_e2e_requests)
+        # Metadata
+        finished_reason_counter = CollectionsCounter(
+            stats.finished_reason_requests)
+        self._log_counter_labels(self.metrics.counter_request_success,
+                                 finished_reason_counter,
+                                 Metrics.labelname_finish_reason)
+        self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
+                            stats.num_prompt_tokens_requests)
+        self._log_histogram(
+            self.metrics.histogram_num_generation_tokens_request,
+            stats.num_generation_tokens_requests)
+        self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
+        self._log_histogram(self.metrics.histogram_best_of_request,
+                            stats.best_of_requests)
+
+    def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None:
+        # Convenience function for logging to gauge.
+        gauge.labels(**self.labels).set(data)
+
+    def _log_counter(self, counter: Counter, data: Union[int, float]) -> None:
+        # Convenience function for logging to counter.
+        counter.labels(**self.labels).inc(data)
+
+    def _log_counter_labels(self, counter: Counter, data: CollectionsCounter,
+                            label_key: str) -> None:
+        # Convenience function for collection counter of labels.
+        for label, count in data.items():
+            counter.labels(**{**self.labels, label_key: label}).inc(count)
+
+    def _log_histogram(self, histogram: Histogram,
+                       data: Union[List[int], List[float]]) -> None:
+        # Convenience function for logging list to histogram.
+        for datum in data:
+            histogram.labels(**self.labels).observe(datum)
+
+    def _log_prometheus_interval(self, prompt_throughput: float,
+                                 generation_throughput: float) -> None:
+        # Logs metrics to prometheus that are computed every logging_interval.
+        # Support legacy gauge metrics that make throughput calculations on
+        # the vLLM side. Moving forward, we should use counters like
+        # counter_prompt_tokens and counter_generation_tokens, which log raw
+        # data and calculate summaries using rate() on the grafana/prometheus
+        # side. See
+        # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
+        self.metrics.gauge_avg_prompt_throughput.labels(
+            **self.labels).set(prompt_throughput)
+        self.metrics.gauge_avg_generation_throughput.labels(
+            **self.labels).set(generation_throughput)
+
+    def log(self, stats: Stats) -> None:
+        """Called by LLMEngine.
+        Logs to prometheus and tracked stats every iteration.
+        Logs to stdout every self.local_interval seconds."""
+
+        # Log to prometheus.
+        self._log_prometheus(stats)
+
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
+        self.num_generation_tokens.append(stats.num_generation_tokens_iter)
+
+        # Log locally every local_interval seconds.
+        if self._local_interval_elapsed(stats.now):
+            # Compute summary metrics for tracked stats (and log them
+            # to prometheus if applicable).
+            prompt_throughput = self._get_throughput(self.num_prompt_tokens,
+                                                     now=stats.now)
+            generation_throughput = self._get_throughput(
+                self.num_generation_tokens, now=stats.now)
+            self._log_prometheus_interval(
+                prompt_throughput=prompt_throughput,
+                generation_throughput=generation_throughput)
+
+            # Log to stdout.
+            logger.info(
+                "Avg prompt throughput: %.1f tokens/s, "
+                "Avg generation throughput: %.1f tokens/s, "
+                "Running: %d reqs, Swapped: %d reqs, "
+                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
+                "CPU KV cache usage: %.1f%%",
+                prompt_throughput,
+                generation_throughput,
+                stats.num_running_sys,
+                stats.num_swapped_sys,
+                stats.num_waiting_sys,
+                stats.gpu_cache_usage_sys * 100,
+                stats.cpu_cache_usage_sys * 100,
+            )
+
+            # Reset tracked stats for next interval.
+            self.num_prompt_tokens = []
+            self.num_generation_tokens = []
+            self.last_local_log = stats.now
+
+            if stats.spec_decode_metrics is not None:
+                logger.info(
+                    self._format_spec_decode_metrics_str(
+                        stats.spec_decode_metrics))
+
+    def _format_spec_decode_metrics_str(
+            self, metrics: "SpecDecodeWorkerMetrics") -> str:
+
+        return ("Speculative metrics: "
+                f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
+                f"System efficiency: {metrics.system_efficiency:.3f}, "
+                f"Number of speculative tokens: {metrics.num_spec_tokens}, "
+                f"Number of accepted tokens: {metrics.accepted_tokens}, "
+                f"Number of draft tokens: {metrics.draft_tokens}, "
+                f"Number of emitted tokens: {metrics.emitted_tokens}.")
vllm/engine/output_processor/__init__.py ADDED (empty file, no lines shown)
vllm/engine/output_processor/interfaces.py ADDED
@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+from typing import Callable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.config import SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+
+class SequenceGroupOutputProcessor(ABC):
+    """Interface for logic that processes new token ids in sequence groups,
+    managing detokenization, stop checking, and freeing/forking sequences with
+    the scheduler.
+
+    This is highly coupled with the LLMEngine and should be seen as an extension
+    of it. The logic is separated to simplify the LLMEngine class and allow
+    separate implementations for single-step decoding (which supports beam
+    search sequence forking) and multi-step decoding (which does not support
+    beam search, but does support speculative decoding).
+    """
+
+    @staticmethod
+    def create_output_processor(
+        scheduler_config: SchedulerConfig,
+        detokenizer: Detokenizer,
+        scheduler: Scheduler,
+        seq_counter: Counter,
+        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+        stop_checker: "StopChecker",
+    ):
+        """Create an output processor.
+
+        This returns a single-step output processor if num_lookahead_slots is
+        zero, else returns a multi-step output processor.
+        """
+        if scheduler_config.num_lookahead_slots == 0:
+            # Importing here to avoid cycle.
+            from vllm.engine.output_processor.single_step import (
+                SingleStepOutputProcessor)
+            return SingleStepOutputProcessor(
+                scheduler_config,
+                detokenizer,
+                scheduler,
+                seq_counter,
+                stop_checker,
+            )
+        else:
+            # Importing here to avoid cycle.
+            from vllm.engine.output_processor.multi_step import (
+                MultiStepOutputProcessor)
+            return MultiStepOutputProcessor(
+                detokenizer,
+                scheduler,
+                seq_counter,
+                get_tokenizer_for_seq,
+                stop_checker,
+            )
+
+    @abstractmethod
+    def process_outputs(self, sequence_group: SequenceGroup,
+                        outputs: List[SequenceGroupOutput]) -> None:
+        """Process new token ids for the sequence group. Handles logic such as
+        detokenization, stop checking, and freeing/forking sequences in the
+        scheduler.
+        """
+        pass
+
+    @abstractmethod
+    def process_prompt_logprob(self, seq_group: SequenceGroup,
+                               outputs: List[SequenceGroupOutput]) -> None:
+        """Update prompt logprobs received from outputs to seq_group."""
+        pass
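
Note (illustration, not part of the diff): the factory above dispatches purely on scheduler_config.num_lookahead_slots. A rough sketch of that selection, using placeholder collaborators and assuming the concrete processors' constructors simply store what they are given:

from types import SimpleNamespace

from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor

# Placeholders for the collaborators LLMEngine normally supplies
# (Detokenizer, Scheduler, Counter, tokenizer getter, StopChecker).
stub = SimpleNamespace()

processor = SequenceGroupOutputProcessor.create_output_processor(
    scheduler_config=SimpleNamespace(num_lookahead_slots=0),  # 0 -> single-step
    detokenizer=stub,
    scheduler=stub,
    seq_counter=stub,
    get_tokenizer_for_seq=lambda seq: stub,
    stop_checker=stub,
)
# With num_lookahead_slots > 0 (e.g. speculative decoding), the same call
# returns a MultiStepOutputProcessor instead.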
vllm/engine/output_processor/multi_step.py ADDED
@@ -0,0 +1,142 @@
+import functools
+from typing import Callable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (
+    SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
+                           SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+logger = init_logger(__name__)
+
+
+class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
+    """SequenceGroupOutputProcessor which handles logic related to
+    detokenization and stopping conditions. It specializes to "multi-step
+    decoding", where vLLM's worker may generate multiple tokens per invocation.
+    This is currently mutually exclusive with advanced sampling techniques like
+    beam search, which motivates the separation of this logic from the single
+    step output processor.
+
+    This class is responsible for things such as correctly appending all new
+    token ids to their sequence, detokenizing new token ids, truncating new
+    output tokens after an eos token, and correctly handling the case where the
+    number of new output tokens per sequence differs in a single batch.
+    """
+
+    def __init__(
+        self,
+        detokenizer: Detokenizer,
+        scheduler: Scheduler,
+        seq_counter: Counter,
+        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+        stop_checker: StopChecker,
+    ):
+        self.detokenizer = detokenizer
+        self.scheduler = scheduler
+        self.seq_counter = seq_counter
+        self.get_tokenizer_for_seq = get_tokenizer_for_seq
+        self.stop_checker = stop_checker
+
+    def process_prompt_logprob(self, seq_group: SequenceGroup,
+                               outputs: List[SequenceGroupOutput]) -> None:
+        # TODO(sang): Prompt logprob currently not implemented in multi step
+        # workers.
+        self._log_prompt_logprob_unsupported_warning_once()
+
+    @staticmethod
+    @functools.lru_cache()
+    def _log_prompt_logprob_unsupported_warning_once():
+        logger.warning(
+            "Prompt logprob is not supported by multi step workers. "
+            "(e.g., speculative decode uses multi step workers).")
+
+    def process_outputs(self, sequence_group: SequenceGroup,
+                        outputs: List[SequenceGroupOutput]) -> None:
+        """Append new tokens in the outputs to sequences in the sequence group.
+
+        This only supports sequence groups of size 1. It supports greater than
+        one new token per sequence.
+
+        This applies logic like stop condition checking and detokenization,
+        including freeing finished sequences. It also handles cases where there
+        are tokens emitted after the EOS token.
+        """
+        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
+
+        assert seqs, "expected running sequences"
+        assert len(seqs) == 1, (
+            "Beam search not supported in multi-step decoding.")
+        seq = seqs[0]
+
+        # Since there's only one sequence per sequence group, we can take the
+        # first sample.
+        samples = [outputs[step].samples[0] for step in range(len(outputs))]
+
+        # -1 means the output token is not valid (eg. due to spec decode
+        # rejecting tokens).
+        valid_samples = [
+            sample for sample in samples if sample.output_token != -1
+        ]
+        assert valid_samples
+
+        self._process_seq_outputs(seq, valid_samples,
+                                  sequence_group.sampling_params)
+
+    def _process_seq_outputs(self, seq: Sequence,
+                             valid_samples: List[SequenceOutput],
+                             sampling_params: SamplingParams) -> None:
+        output_token_ids = [sample.output_token for sample in valid_samples]
+        output_logprobs = [sample.logprobs for sample in valid_samples]
+
+        # Truncate to max_tokens if necessary.
+        remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() +
+                                                         len(output_token_ids))
+        if remaining_tokens < 0:
+            valid_samples = valid_samples[:remaining_tokens]
+            output_token_ids = output_token_ids[:remaining_tokens]
+
+        # Truncate any tokens after EOS. This is required as spec decode
+        # generates a fixed number of tokens without evaluating stopping
+        # conditions within the block. This can cause an eos token to be
+        # unintentionally ignored.
+        if not sampling_params.ignore_eos:
+            eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id
+            # Avoiding .index calls as exception throwing in the happy path
+            # is expensive.
+            for i in range(len(output_token_ids)):
+                if output_token_ids[i] == eos_token_id:
+                    output_token_ids = output_token_ids[:i + 1]
+                    valid_samples = valid_samples[:i + 1]
+                    break
+
+        # Incrementally append tokens to the sequence, as if we had only one new
+        # token.
+        for output_token_id, output_logprob in zip(output_token_ids,
+                                                   output_logprobs):
+            seq.append_token_id(
+                token_id=output_token_id,
+                logprobs=output_logprob,
+            )
+
+            new_char_count = 0
+            if sampling_params.detokenize:
+                new_char_count = self.detokenizer.decode_sequence_inplace(
+                    seq, sampling_params)
+
+            self.stop_checker.maybe_stop_sequence(
+                seq,
+                new_char_count=new_char_count,
+                sampling_params=sampling_params)
+            if seq.is_finished():
+                break
+
+        if seq.is_finished():
+            self.scheduler.free_seq(seq)
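
Note (illustration, not part of the diff): the max_tokens truncation in _process_seq_outputs leans on Python's negative slicing: when remaining_tokens is negative, samples[:remaining_tokens] drops exactly that many trailing samples. A standalone sketch of the same two truncation rules on plain token-id lists (the real code avoids .index for speed; this version favors brevity):

from typing import List

def truncate_new_tokens(new_token_ids: List[int], already_generated: int,
                        max_tokens: int, eos_token_id: int,
                        ignore_eos: bool) -> List[int]:
    """Mirror of the truncation rules used by MultiStepOutputProcessor."""
    # 1) Enforce max_tokens: a negative remainder trims that many tokens
    #    from the end via negative slicing.
    remaining = max_tokens - (already_generated + len(new_token_ids))
    if remaining < 0:
        new_token_ids = new_token_ids[:remaining]
    # 2) Enforce EOS: keep tokens up to and including the first EOS.
    if not ignore_eos and eos_token_id in new_token_ids:
        new_token_ids = new_token_ids[:new_token_ids.index(eos_token_id) + 1]
    return new_token_ids

# 5 speculative tokens proposed, but only 3 slots left before max_tokens,
# and an EOS (id 2) appears second: the EOS rule wins here.
print(truncate_new_tokens([7, 2, 9, 4, 1], already_generated=13,
                          max_tokens=16, eos_token_id=2, ignore_eos=False))
# -> [7, 2]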